From dc8c133c477856efe19c417c987aeee201c48611 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 27 Sep 2019 13:35:07 -0700 Subject: [PATCH 01/22] WIP --- README.md | 36 -------------------------- README.rst | 58 ++++++++++++++++++++++++++++++++++++++++++ python/environment.yml | 1 + python/setup.py | 23 +++++++++++++++++ 4 files changed, 82 insertions(+), 36 deletions(-) delete mode 100644 README.md create mode 100644 README.rst create mode 100644 python/setup.py diff --git a/README.md b/README.md deleted file mode 100644 index 3c1afbeb3..000000000 --- a/README.md +++ /dev/null @@ -1,36 +0,0 @@ -[![CircleCI](https://circleci.com/gh/databricks/spark-genomics.svg?style=svg&circle-token=31dc0fb939711565583c10d783f424ad2fb81e38)](https://circleci.com/gh/databricks/spark-genomics) - -# Building and Testing -This project is built using sbt: https://www.scala-sbt.org/1.0/docs/Setup.html - -Start an sbt shell using the `sbt` command. - -To compile the main code: -``` -compile -``` - -To run all tests: -``` -test -``` - -To test a specific suite: -``` -testOnly *VCFDataSourceSuite -``` - -If you use IntelliJ, you'll want to set up [scalafmt on save](https://scalameta.org/scalafmt/docs/installation.html). - -To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to 'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the definition of options in groupByHash function in build.sbt to -``` -val options = ForkOptions().withRunJVMOptions(Vector("-Xmx1024m")).withRunJVMOptions(Vector("-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005")) -``` - -To run Python tests, you must install and activate the conda environment in -`python/environment.yml`. You can then run tests from sbt: -``` -python/test -``` - -These tests will run with the same Spark classpath as the Scala tests. diff --git a/README.rst b/README.rst new file mode 100644 index 000000000..917e9b9a5 --- /dev/null +++ b/README.rst @@ -0,0 +1,58 @@ +================================== +Glow: Genomics on Apache Spark SQL +================================== + +Glow is a library to ETL genomics data using Apache Spark SQL. + +|circle-ci| + +.. |circle-ci| image:: https://circleci.com/gh/databricks/spark-genomics.svg?style=svg&circle-token=31dc0fb939711565583c10d783f424ad2fb81e38 + :target: https://circleci.com/gh/databricks/spark-genomics + +Building and Testing +-------------------- +This project is built using sbt_. + +.. _sbt: https://www.scala-sbt.org/1.0/docs/Setup.html + +Start an sbt shell using the ``sbt`` command. + +To compile the main code: + +.. code-block:: sh + + compile + + +To run all tests: + +.. code-block:: sh + + test + +To test a specific suite: + +.. code-block:: sh + + testOnly *VCFDataSourceSuite + +If you use IntelliJ, you'll want to set up `scalafmt on save`_. + +.. _scalafmt on save: https://scalameta.org/scalafmt/docs/installation.html + +To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to 'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the definition of ``options`` in ``groupByHash`` function in ``build.sbt`` to: + +.. 
code-block:: scala + + val options = ForkOptions().withRunJVMOptions(Vector("-Xmx1024m")) + .withRunJVMOptions(Vector("-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005")) + + +To run Python tests, you must install and activate the conda environment in +``python/environment.yml``. You can then run tests from sbt: + +.. code-block:: sh + + python/test + +These tests will run with the same Spark classpath as the Scala tests. diff --git a/python/environment.yml b/python/environment.yml index 410c8ec18..fefd51db1 100644 --- a/python/environment.yml +++ b/python/environment.yml @@ -5,4 +5,5 @@ dependencies: - pip - pip: - pyspark==2.4.2 + - setuptools==41.2.0 - typeguard==2.5.0 diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 000000000..7404946cc --- /dev/null +++ b/python/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup + +setup( + name='pyglow', + version='1.0.0', + packages=['db_genomics'], + install_requires=[ + 'pyspark==2.4.2', + 'pytest', + 'typeguard==2.5.0', + ], + zip_safe=False, + author='Glow Project', + description='Glow: Genomics on Apache Spark SQL', + long_description=open('README.rst').read(), + license='Apache License 2.0', + classifiers=[ + 'Intended Audience :: Developers', + 'Programming Language :: Python :: 3.6', + ], + keywords=['databricks'], + url='https://glow-genomics.org/' +) From 9d89a8bb2ce5ee5fe61f89fea42bb370c2474544 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 3 Oct 2019 19:47:14 -0400 Subject: [PATCH 02/22] Get jar working Don't use Kryo serializer Don't parallelize un-serializable Hadoop FileStatus Change descrip WIP Whoops bintray Not local Quiet logs Remove tmp file Actually rename bintray Setting version to 0.1.0 WIP WIP License fixup Resolver WIP Change version Setting version to 0.1.1 WIP Setting version to 0.1.2 Setting version to 0.1.3-SNAPSHOT WIP Setting version to 0.1.2 Setting version to 0.1.3-SNAPSHOT Exclude many GATK deps Setting version to 0.1.3 Setting version to 0.1.4-SNAPSHOT Setting version to 0.1.4 Setting version to 0.1.5-SNAPSHOT Whoops Setting version to 0.1.3 Setting version to 0.1.4-SNAPSHOT Setting version to 0.1.4 Setting version to 0.1.5-SNAPSHOT Setting version to 0.1.6 Setting version to 0.1.7-SNAPSHOT Yay deps Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT Setting version to 0.1.8 Setting version to 0.1.9-SNAPSHOT Setting version to 0.1.1 Setting version to 0.1.2-SNAPSHOT Setting version to 0.1.10 Setting version to 0.1.11-SNAPSHOT Setting version to 0.1.15 Setting version to 0.1.16-SNAPSHOT Setting version to 0.1.9 Setting version to 0.1.10-SNAPSHOT WIP Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT Setting version to 0.1.8 Setting version to 0.1.9-SNAPSHOT Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT Add tests back Setting version to 0.1.8 Setting version to 0.1.9-SNAPSHOT Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT Setting version to 0.1.13 Setting version to 0.1.14-SNAPSHOT WIP Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT WIP Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT WIP Setting version to 0.1.8 Setting version to 0.1.9-SNAPSHOT Setting version to 0.1.11 Setting version to 0.1.12-SNAPSHOT Setting version to 0.1.7 Setting version to 0.1.8-SNAPSHOT Exclude findbugs Setting version to 0.1.8 Setting version to 0.1.9-SNAPSHOT WIP Cleanup --- .gitignore | 27 ++- README.rst | 6 +- build.sbt | 161 +++++++++++------- .../databricks/bgen/BgenSchemaInferrer.scala | 13 +- 
.../com/databricks/vcf/VCFFileFormat.scala | 8 +- core/src/test/resources/log4j.properties | 4 + .../com/databricks/hls/sql/HLSBaseTest.scala | 1 - project/plugins.sbt | 2 + python/setup.py | 8 +- version.sbt | 1 + 10 files changed, 149 insertions(+), 82 deletions(-) create mode 100644 version.sbt diff --git a/.gitignore b/.gitignore index d5f68b73d..c9248ce70 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,28 @@ +# Java targets target core/target project/target -.idea -*.swp -*.swo -adam.log + +# Distribution / packaging +build/ +dist/ +maven-repo/ +*.egg-info/ + +# Mac +.DS_Store + +# Byte-compiled / optimized / DLL files **/__pycache__ *.pyc -.DS_Store + +# Sphinx documentation docs/build + +# Editor files +*.swp +*.swo +.idea + +# ADAM logs +adam.log diff --git a/README.rst b/README.rst index 917e9b9a5..1365253d4 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,9 @@ If you use IntelliJ, you'll want to set up `scalafmt on save`_. .. _scalafmt on save: https://scalameta.org/scalafmt/docs/installation.html -To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to 'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the definition of ``options`` in ``groupByHash`` function in ``build.sbt`` to: +To ``test`` or ``testOnly`` in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to +'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the +definition of ``options`` in ``groupByHash`` function in ``build.sbt`` to: .. code-block:: scala @@ -49,7 +51,7 @@ To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug To run Python tests, you must install and activate the conda environment in -``python/environment.yml``. You can then run tests from sbt: +``python/environment.yml``. You can then run tests from ``sbt``: .. 
code-block:: sh diff --git a/build.sbt b/build.sbt index 82a138974..05b307dbc 100644 --- a/build.sbt +++ b/build.sbt @@ -5,10 +5,11 @@ val sparkVersion = "2.4.3" val scalaMajorMinor = "2.11" ThisBuild / scalaVersion := s"$scalaMajorMinor.12" -ThisBuild / version := "0.1.0-SNAPSHOT" +ThisBuild / version := "0.1.1" ThisBuild / organization := "com.databricks" ThisBuild / organizationName := "DB / RGC" ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml" +ThisBuild / publish / skip := true // Compile Java sources before Scala sources, so Scala code can depend on Java // but not vice versa @@ -29,7 +30,7 @@ def groupByHash(tests: Seq[TestDefinition]) = { .map { case (i, tests) => val options = ForkOptions() - .withRunJVMOptions(Vector("-Xmx1024m")) + .withRunJVMOptions(Vector("-Dspark.ui.enabled=false", "-Xmx1024m")) new Group(i.toString, tests, SubProcess(options)) } .toSeq @@ -57,46 +58,79 @@ lazy val commonSettings = Seq( scalacOptions += "-target:jvm-1.8" ) +lazy val dependencies = Seq( + "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided", + "org.apache.spark" %% "spark-core" % sparkVersion % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided", + "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", + "org.seqdoop" % "hadoop-bam" % "7.9.1", + "log4j" % "log4j" % "1.2.17", + "org.slf4j" % "slf4j-api" % "1.7.16", + "org.slf4j" % "slf4j-log4j12" % "1.7.16", + "org.jdbi" % "jdbi" % "2.63.1", + "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2", + // Exclude extraneous GATK dependencies + ("org.broadinstitute" % "gatk" % "4.0.11.0") + .exclude("biz.k11i", "xgboost-predictor") + .exclude("com.esotericsoftware", "kryo") + .exclude("com.esotericsoftware", "reflectasm") + .exclude("com.github.fommil.netlib", "netlib-native_ref-osx-x86_64") + .exclude("com.github.fommil.netlib", "netlib-native_ref-linux-x86_64") + .exclude("com.github.fommil.netlib", "netlib-native_system-linux-x86_64") + .exclude("com.github.fommil.netlib", "netlib-native_system-osx-x86_64") + .exclude("com.github.jsr203hadoop", "jsr203hadoop") + .exclude("com.google.cloud", "google-cloud-nio") + .exclude("com.google.cloud.bigdataoss", "gcs-connector") + .exclude("com.intel", "genomicsdb") + .exclude("com.intel.gkl", "gkl") + .exclude("com.opencsv", "opencsv") + .exclude("commons-io", "commons-io") + .exclude("gov.nist.math.jama", "gov.nist.math.jama") + .exclude("it.unimi.dsi", "fastutil") + .exclude("org.aeonbits.owner", "owner") + .exclude("org.apache.commons", "commons-lang3") + .exclude("org.apache.commons", "commons-math3") + .exclude("org.apache.commons", "commons-collections4") + .exclude("org.apache.commons", "commons-vfs2") + .exclude("org.apache.hadoop", "hadoop-client") + .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor") + .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor") + .exclude("org.broadinstitute", "barclay") + .exclude("org.broadinstitute", "hdf5-java-bindings") + .exclude("org.broadinstitute", "gatk-native-bindings") + .exclude("org.broadinstitute", "gatk-bwamem-jni") + .exclude("org.broadinstitute", "gatk-fermilite-jni") + .exclude("org.jgrapht", "jgrapht-core") + .exclude("org.objenesis", "objenesis") + .exclude("org.ojalgo", "ojalgo") + .exclude("org.ojalgo", "ojalgo-commons-math3") + .exclude("org.reflections", "reflections") + .exclude("org.seqdoop", "hadoop-bam") + .exclude("org.xerial", "sqlite-jdbc"), + // Test dependencies + "org.scalatest" %% "scalatest" % "3.0.3" % "test", + 
"org.scalacheck" %% "scalacheck" % "1.12.5" % "test", + "org.mockito" % "mockito-all" % "1.9.5" % "test", + "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests", + "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests", + "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests", + "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test", + "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test", + "org.xerial" % "sqlite-jdbc" % "3.20.1" % "test" +).map(_.exclude("com.google.code.findbugs", "jsr305")) + lazy val core = (project in file("core")) .settings( commonSettings, name := "spark-genomics", - libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-catalyst" % sparkVersion % "provided", - "org.apache.spark" %% "spark-core" % sparkVersion % "provided", - "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided", - "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", - "com.github.samtools" % "htsjdk" % "2.20.0", - "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.9", - "org.seqdoop" % "hadoop-bam" % "7.9.1", - "log4j" % "log4j" % "1.2.17", - "org.slf4j" % "slf4j-api" % "1.7.16", - "org.slf4j" % "slf4j-log4j12" % "1.7.16", - "org.jdbi" % "jdbi" % "2.63.1", - "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2", - // Exclude extraneous GATK dependencies - ("org.broadinstitute" % "gatk" % "4.0.11.0") - .exclude("biz.k11i", "xgboost-predictor") - .exclude("com.google.cloud.bigdataoss", "gcs-connector") - .exclude("com.intel", "genomicsdb") - .exclude("org.apache.spark", s"spark-mllib_$scalaMajorMinor") - .exclude("org.bdgenomics.adam", s"adam-core-spark2_$scalaMajorMinor") - .exclude("com.google.cloud", "google-cloud-nio"), - // Test dependencies - "org.scalatest" %% "scalatest" % "3.0.3" % "test", - "org.scalacheck" %% "scalacheck" % "1.12.5" % "test", - "org.mockito" % "mockito-all" % "1.9.5" % "test", - "org.apache.spark" %% "spark-core" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-catalyst" % sparkVersion % "test" classifier "tests", - "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests", - "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.28.0" % "test", - "org.bdgenomics.bdg-formats" % "bdg-formats" % "0.11.3" % "test" - ), + publish / skip := false, + libraryDependencies ++= dependencies, // Fix versions of libraries that are depended on multiple times dependencyOverrides ++= Seq( "org.apache.hadoop" % "hadoop-client" % "2.7.3", "io.netty" % "netty" % "3.9.9.Final", - "io.netty" % "netty-all" % "4.1.17.Final" + "io.netty" % "netty-all" % "4.1.17.Final", + "com.github.samtools" % "htsjdk" % "2.20.1" ) ) @@ -121,33 +155,38 @@ lazy val python = "SPARK_HOME" -> (ThisBuild / baseDirectory).value.absolutePath ).! require(ret == 0, "Python tests failed") - } + }, + publish / skip := true ) -// Uncomment the following for publishing to Sonatype. -// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for more detail. 
+// Publish to Bintray +ThisBuild / description := "Glow: Genomics on Apache Spark" +ThisBuild / licenses := List( + "Apache-2.0" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt")) +ThisBuild / homepage := Some(url("https://github.com/databricks/spark-genomics")) +ThisBuild / scmInfo := Some( + ScmInfo( + url("https://github.com/databricks/spark-genomics"), + "scm:git@github.com:databricks/spark-genomics.git" + ) +) +ThisBuild / pomIncludeRepository := { _ => + false +} +ThisBuild / publishMavenStyle := true + +ThisBuild / bintrayOrganization := Some("databricks") -// ThisBuild / description := "Some descripiton about your project." -// ThisBuild / licenses := List("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt")) -// ThisBuild / homepage := Some(url("https://github.com/example/project")) -// ThisBuild / scmInfo := Some( -// ScmInfo( -// url("https://github.com/your-account/your-project"), -// "scm:git@github.com:your-account/your-project.git" -// ) -// ) -// ThisBuild / developers := List( -// Developer( -// id = "Your identifier", -// name = "Your Name", -// email = "your@email", -// url = url("http://your.url") -// ) -// ) -// ThisBuild / pomIncludeRepository := { _ => false } -// ThisBuild / publishTo := { -// val nexus = "https://oss.sonatype.org/" -// if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") -// else Some("releases" at nexus + "service/local/staging/deploy/maven2") -// } -// ThisBuild / publishMavenStyle := true +import ReleaseTransformations._ + +releaseProcess := Seq[ReleaseStep]( + checkSnapshotDependencies, + inquireVersions, + runTest, + setReleaseVersion, + commitReleaseVersion, + tagRelease, + publishArtifacts, + setNextVersion, + commitNextVersion +) diff --git a/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala b/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala index 936d11a52..f6ddda0b5 100644 --- a/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala +++ b/core/src/main/scala/com/databricks/bgen/BgenSchemaInferrer.scala @@ -30,19 +30,20 @@ object BgenSchemaInferrer { val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration) val ignoreExtension = options.get(BgenFileFormat.IGNORE_EXTENSION_KEY).exists(_.toBoolean) - val bgenFiles = + val bgenPaths = files.filter { fs => fs.getLen > 0 && (fs .getPath .toString .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension) - } + }.map(_.getPath.toString) val hasSampleIds = spark .sparkContext - .parallelize(bgenFiles) - .map { fs => - val hadoopFs = fs.getPath.getFileSystem(serializableConf.value) - WithUtils.withCloseable(hadoopFs.open(fs.getPath)) { stream => + .parallelize(bgenPaths) + .map { path => + val hPath = new Path(path) + val hadoopFs = hPath.getFileSystem(serializableConf.value) + WithUtils.withCloseable(hadoopFs.open(hPath)) { stream => val littleEndianDataInputStream = new LittleEndianDataInputStream(stream) new BgenHeaderReader(littleEndianDataInputStream) .readHeader(None) diff --git a/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala b/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala index 971407232..041f1ba5c 100644 --- a/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala +++ b/core/src/main/scala/com/databricks/vcf/VCFFileFormat.scala @@ -442,13 +442,15 @@ private[vcf] object SchemaDelegate { files: Seq[FileStatus]): (Seq[VCFInfoHeaderLine], Seq[VCFFormatHeaderLine]) = { val serializableConf = new 
SerializableConfiguration(spark.sessionState.newHadoopConf()) + val filePaths = files.map(_.getPath.toString) spark .sparkContext - .parallelize(files.map(_.getPath.toString)) + .parallelize(filePaths) .map { path => val (header, _) = VCFFileFormat.createVCFCodec(path, serializableConf.value) - - (header.getInfoHeaderLines.asScala.toSeq, header.getFormatHeaderLines.asScala.toSeq) + val infoHeaderLines = header.getInfoHeaderLines.asScala.toSet + val formatHeaderLines = header.getFormatHeaderLines.asScala.toSet + (infoHeaderLines, formatHeaderLines) } .collect() .foldLeft((Seq.empty[VCFInfoHeaderLine], Seq.empty[VCFFormatHeaderLine])) { diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index 7b5a7da6d..97c38689c 100644 --- a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -9,3 +9,7 @@ log4j.logger.akka=ERROR log4j.logger.Remoting=ERROR log4j.logger.org.apache.spark=WARN log4j.logger.org.eclipse.jetty=ERROR +log4j.logger.org.apache.hadoop=ERROR + +# Quiet our logs +log4j.logger.com.databricks=ERROR diff --git a/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala b/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala index 3983a3a70..cfc489da1 100644 --- a/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala +++ b/core/src/test/scala/com/databricks/hls/sql/HLSBaseTest.scala @@ -19,7 +19,6 @@ abstract class HLSBaseTest override protected def sparkConf: SparkConf = { super .sparkConf - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.driver.maxResultSize", "0") .set("spark.kryo.registrator", "org.broadinstitute.hellbender.engine.spark.GATKRegistrator") .set("spark.kryoserializer.buffer.max", "2047m") diff --git a/project/plugins.sbt b/project/plugins.sbt index e97c33c28..f6a0ebe23 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,7 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") +addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11") addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") +addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4") addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.0") addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") diff --git a/python/setup.py b/python/setup.py index 7404946cc..df4315f0d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -9,15 +9,15 @@ 'pytest', 'typeguard==2.5.0', ], - zip_safe=False, author='Glow Project', - description='Glow: Genomics on Apache Spark SQL', - long_description=open('README.rst').read(), + description='Glow: Genomics on Apache Spark', + long_description=open('../README.rst').read(), + long_description_content_type='text/x-rst', license='Apache License 2.0', classifiers=[ 'Intended Audience :: Developers', 'Programming Language :: Python :: 3.6', ], keywords=['databricks'], - url='https://glow-genomics.org/' + url='https://github.com/databricks/spark-genomics' ) diff --git a/version.sbt b/version.sbt new file mode 100644 index 000000000..9d9021feb --- /dev/null +++ b/version.sbt @@ -0,0 +1 @@ +version in ThisBuild := "0.1.9-SNAPSHOT" From d2c3a1331b83edefcdf304afda51a718f6c4450d Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 7 Oct 2019 11:37:47 -0400 Subject: [PATCH 03/22] Rename org --- .gitignore | 1 + build.sbt | 12 ++++++------ version.sbt | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) delete mode 100644 version.sbt diff --git 
a/.gitignore b/.gitignore index c9248ce70..f3af64e6e 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ docs/build # Editor files *.swp *.swo +*.iml .idea # ADAM logs diff --git a/build.sbt b/build.sbt index 05b307dbc..e5b9d33b9 100644 --- a/build.sbt +++ b/build.sbt @@ -5,8 +5,8 @@ val sparkVersion = "2.4.3" val scalaMajorMinor = "2.11" ThisBuild / scalaVersion := s"$scalaMajorMinor.12" -ThisBuild / version := "0.1.1" -ThisBuild / organization := "com.databricks" +ThisBuild / version := "0.1.0" +ThisBuild / organization := "projectglow" ThisBuild / organizationName := "DB / RGC" ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml" ThisBuild / publish / skip := true @@ -122,7 +122,7 @@ lazy val dependencies = Seq( lazy val core = (project in file("core")) .settings( commonSettings, - name := "spark-genomics", + name := "glow", publish / skip := false, libraryDependencies ++= dependencies, // Fix versions of libraries that are depended on multiple times @@ -163,11 +163,11 @@ lazy val python = ThisBuild / description := "Glow: Genomics on Apache Spark" ThisBuild / licenses := List( "Apache-2.0" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt")) -ThisBuild / homepage := Some(url("https://github.com/databricks/spark-genomics")) +ThisBuild / homepage := Some(url("https://github.com/projectglow/glow")) ThisBuild / scmInfo := Some( ScmInfo( - url("https://github.com/databricks/spark-genomics"), - "scm:git@github.com:databricks/spark-genomics.git" + url("https://github.com/projectglow/glow"), + "scm:git@github.com:projectglow/glow.git" ) ) ThisBuild / pomIncludeRepository := { _ => diff --git a/version.sbt b/version.sbt deleted file mode 100644 index 9d9021feb..000000000 --- a/version.sbt +++ /dev/null @@ -1 +0,0 @@ -version in ThisBuild := "0.1.9-SNAPSHOT" From b8fed938b5779b248d40eeadab67d254de7b4eb1 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 7 Oct 2019 14:07:13 -0400 Subject: [PATCH 04/22] Rename env --- python/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/environment.yml b/python/environment.yml index fefd51db1..79ebd6c2b 100644 --- a/python/environment.yml +++ b/python/environment.yml @@ -1,4 +1,4 @@ -name: spark-genomics +name: glow dependencies: - python=3.7 - pytest From 987609644521e7464581afe51f4d8bbfe937327b Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 7 Oct 2019 14:10:12 -0400 Subject: [PATCH 05/22] Setting version to 0.1.0 --- version.sbt | 1 + 1 file changed, 1 insertion(+) create mode 100644 version.sbt diff --git a/version.sbt b/version.sbt new file mode 100644 index 000000000..e76544405 --- /dev/null +++ b/version.sbt @@ -0,0 +1 @@ +version in ThisBuild := "0.1.0" From 8dc77ae679025169daa4da92533c52ef5f2907d5 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 7 Oct 2019 14:10:37 -0400 Subject: [PATCH 06/22] Setting version to 0.1.1-SNAPSHOT --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index e76544405..1be9a631a 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.1.0" +version in ThisBuild := "0.1.1-SNAPSHOT" From f994805ebf804409e9a0866640f791160d170499 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 7 Oct 2019 14:27:21 -0400 Subject: [PATCH 07/22] Rename --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index df4315f0d..69065777d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -19,5 +19,5 @@ 
'Programming Language :: Python :: 3.6', ], keywords=['databricks'], - url='https://github.com/databricks/spark-genomics' + url='https://github.com/projectglow/glow' ) From 26d62d6b86e570a10eee4580eee5bccbbd3aa6fe Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 8 Oct 2019 04:46:06 -0400 Subject: [PATCH 08/22] Work on test.pypi --- python/environment.yml | 1 + python/setup.py | 12 +++++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/environment.yml b/python/environment.yml index 79ebd6c2b..2ad9a6cad 100644 --- a/python/environment.yml +++ b/python/environment.yml @@ -7,3 +7,4 @@ dependencies: - pyspark==2.4.2 - setuptools==41.2.0 - typeguard==2.5.0 + - twine==2.0.0 \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 69065777d..55f72f137 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,14 +1,12 @@ from setuptools import setup setup( - name='pyglow', - version='1.0.0', + name='glowpy', + version='1.0.2', packages=['db_genomics'], - install_requires=[ - 'pyspark==2.4.2', - 'pytest', - 'typeguard==2.5.0', - ], + # install_requires=[ + # 'typeguard==2.5.0', + # ], author='Glow Project', description='Glow: Genomics on Apache Spark', long_description=open('../README.rst').read(), From d33282384996c3ef2b722098e5e7377817993f09 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 8 Oct 2019 05:00:51 -0400 Subject: [PATCH 09/22] Move build/test from README to wiki --- README.rst | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/README.rst b/README.rst index 965325371..efc307a2a 100644 --- a/README.rst +++ b/README.rst @@ -2,59 +2,9 @@ Glow: Genomics on Apache Spark SQL ================================== -Glow is a library to ETL genomics data using Apache Spark SQL. - |circle-ci| .. |circle-ci| image:: https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d :target: https://circleci.com/gh/projectglow/glow -Building and Testing --------------------- -This project is built using sbt_. - -.. _sbt: https://www.scala-sbt.org/1.0/docs/Setup.html - -Start an sbt shell using the ``sbt`` command. - -To compile the main code: - -.. code-block:: sh - - compile - - -To run all tests: - -.. code-block:: sh - - test - -To test a specific suite: - -.. code-block:: sh - - testOnly *VCFDataSourceSuite - -If you use IntelliJ, you'll want to set up `scalafmt on save`_. - -.. _scalafmt on save: https://scalameta.org/scalafmt/docs/installation.html - -To ``test`` or ``testOnly`` in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to -'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the -definition of ``options`` in ``groupByHash`` function in ``build.sbt`` to: - -.. code-block:: scala - - val options = ForkOptions().withRunJVMOptions(Vector("-Xmx1024m")) - .withRunJVMOptions(Vector("-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005")) - - -To run Python tests, you must install and activate the conda environment in -``python/environment.yml``. You can then run tests from ``sbt``: - -.. code-block:: sh - - python/test - -These tests will run with the same Spark classpath as the Scala tests. +Glow is a library to ETL genomics data using Apache Spark SQL. 
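The patches above prepare the Python artifact for PyPI: ``twine`` is pinned in ``python/environment.yml`` and ``setup.py`` (now ``glowpy``) reads its long description from the top-level ``README.rst``. As a rough sketch of the publishing flow this enables (a sketch only; it assumes the active conda environment and credentials for test.pypi.org, neither of which these patches set up):

.. code-block:: sh

   # Build a source distribution from the python/ directory
   cd python
   python setup.py sdist

   # Upload to test.pypi.org first; drop --repository-url to publish to the
   # real index once the metadata looks right
   twine upload --repository-url https://test.pypi.org/legacy/ dist/*
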
From 7d367370ec979d84dff29f6fb6215a68c241f06e Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 8 Oct 2019 11:14:35 +0200 Subject: [PATCH 10/22] More cleanup --- python/setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 55f72f137..07afb051d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -2,7 +2,7 @@ setup( name='glowpy', - version='1.0.2', + version='1.0.3', packages=['db_genomics'], # install_requires=[ # 'typeguard==2.5.0', @@ -16,6 +16,5 @@ 'Intended Audience :: Developers', 'Programming Language :: Python :: 3.6', ], - keywords=['databricks'], url='https://github.com/projectglow/glow' ) From cc51344fd5ccbaad0238edb48dbd09a1d8a97fa2 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 8 Oct 2019 11:15:18 +0200 Subject: [PATCH 11/22] Newline --- python/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/environment.yml b/python/environment.yml index 2ad9a6cad..1dc6a1fb0 100644 --- a/python/environment.yml +++ b/python/environment.yml @@ -7,4 +7,4 @@ dependencies: - pyspark==2.4.2 - setuptools==41.2.0 - typeguard==2.5.0 - - twine==2.0.0 \ No newline at end of file + - twine==2.0.0 From 8e00e3ef54564c45894f0cfdf6fbecfa48abf4c7 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 8 Oct 2019 11:16:53 +0200 Subject: [PATCH 12/22] Test CircleCI From d30e21ab26e8f3dad09b90134ea73879dc448d76 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 9 Oct 2019 15:13:28 +0200 Subject: [PATCH 13/22] address comments --- README.md | 36 ------------------------ core/src/test/resources/log4j.properties | 3 -- python/environment.yml | 4 +-- python/setup.py | 8 +++--- 4 files changed, 6 insertions(+), 45 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 759a24ac2..000000000 --- a/README.md +++ /dev/null @@ -1,36 +0,0 @@ -[![CircleCI](https://circleci.com/gh/projectglow/glow.svg?style=svg&circle-token=7511f70b2c810a18e88b5c537b0410e82db8617d)](https://circleci.com/gh/projectglow/glow) - -# Building and Testing -This project is built using sbt: https://www.scala-sbt.org/1.0/docs/Setup.html - -Start an sbt shell using the `sbt` command. - -To compile the main code: -``` -compile -``` - -To run all tests: -``` -test -``` - -To test a specific suite: -``` -testOnly *VCFDataSourceSuite -``` - -If you use IntelliJ, you'll want to set up [scalafmt on save](https://scalameta.org/scalafmt/docs/installation.html). - -To test or testOnly in remote debug mode with IntelliJ IDEA set the remote debug configuration in IntelliJ to 'Attach to remote JVM' mode and a specific port number (here the default port number 5005 is used) and then modify the definition of options in groupByHash function in build.sbt to -``` -val options = ForkOptions().withRunJVMOptions(Vector("-Xmx1024m")).withRunJVMOptions(Vector("-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005")) -``` - -To run Python tests, you must install and activate the conda environment in -`python/environment.yml`. You can then run tests from sbt: -``` -python/test -``` - -These tests will run with the same Spark classpath as the Scala tests. 
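With the stale README.md gone and the build/test instructions moved to the wiki, the only in-repo record of the workflow is the sbt build itself. For reference, one way to drive the same tasks non-interactively from a shell (a sketch assuming conda and sbt are on the PATH; the ``glow`` environment name comes from ``python/environment.yml``):

.. code-block:: sh

   # Create and activate the conda environment used by the Python tests
   conda env create -f python/environment.yml
   conda activate glow

   # Compile and run the Scala suites, a single suite, and the Python tests
   sbt compile test
   sbt "testOnly *VCFDataSourceSuite"
   sbt python/test
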
diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index 97c38689c..0015ee871 100644 --- a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -10,6 +10,3 @@ log4j.logger.Remoting=ERROR log4j.logger.org.apache.spark=WARN log4j.logger.org.eclipse.jetty=ERROR log4j.logger.org.apache.hadoop=ERROR - -# Quiet our logs -log4j.logger.com.databricks=ERROR diff --git a/python/environment.yml b/python/environment.yml index 1dc6a1fb0..669afc851 100644 --- a/python/environment.yml +++ b/python/environment.yml @@ -5,6 +5,6 @@ dependencies: - pip - pip: - pyspark==2.4.2 - - setuptools==41.2.0 + - setuptools==41.2.0 # Python packaging - typeguard==2.5.0 - - twine==2.0.0 + - twine==2.0.0 # Pypi publishing diff --git a/python/setup.py b/python/setup.py index 07afb051d..d745c5b5e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -4,9 +4,9 @@ name='glowpy', version='1.0.3', packages=['db_genomics'], - # install_requires=[ - # 'typeguard==2.5.0', - # ], + install_requires=[ + 'typeguard==2.5.0', + ], author='Glow Project', description='Glow: Genomics on Apache Spark', long_description=open('../README.rst').read(), @@ -14,7 +14,7 @@ license='Apache License 2.0', classifiers=[ 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], url='https://github.com/projectglow/glow' ) From 0bb56ed4cd6109a461846590899854dce80b2ab9 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 9 Oct 2019 15:24:11 +0200 Subject: [PATCH 14/22] Circleci fixups --- .circleci/config.yml | 12 ++++++------ build.sbt | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1d4458de6..23ceb08c0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,17 +1,17 @@ version: 2.1 jobs: test: - working_directory: ~/spark-genomics + working_directory: ~/glow docker: - image: circleci/openjdk:8 steps: - - restore_cache: - keys: - - conda-deps-v1-{{ checksum "python/environment.yml" }} - - checkout + - restore_cache: + keys: + - conda-deps-v1-{{ checksum "python/environment.yml" }} + - run: name: install dependencies command: | @@ -28,7 +28,7 @@ jobs: name: run tests environment: command: | - export PATH=$HOME/conda/envs/spark-genomics/bin:$PATH + export PATH=$HOME/conda/envs/glow/bin:$PATH sbt test exit - save_cache: diff --git a/build.sbt b/build.sbt index e5b9d33b9..6267452a3 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ val scalaMajorMinor = "2.11" ThisBuild / scalaVersion := s"$scalaMajorMinor.12" ThisBuild / version := "0.1.0" -ThisBuild / organization := "projectglow" +ThisBuild / organization := "org.projectglow" ThisBuild / organizationName := "DB / RGC" ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml" ThisBuild / publish / skip := true @@ -139,7 +139,7 @@ lazy val python = .dependsOn(core % "test->test") .settings( unmanagedSourceDirectories in Compile := { - Seq(baseDirectory.value / "spark_genomics") + Seq(baseDirectory.value / "glow") }, test in Test := { // Pass the test classpath to pyspark so that we run the same bits as the Scala tests From c4c09d10e4750579063c0b6cdb7d4f77536a5c25 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 9 Oct 2019 15:26:46 +0200 Subject: [PATCH 15/22] Un-exclude netlib from gatk --- build.sbt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/build.sbt b/build.sbt index 6267452a3..fa96bdfcf 100644 --- a/build.sbt +++ b/build.sbt @@ -74,10 
+74,6 @@ lazy val dependencies = Seq( .exclude("biz.k11i", "xgboost-predictor") .exclude("com.esotericsoftware", "kryo") .exclude("com.esotericsoftware", "reflectasm") - .exclude("com.github.fommil.netlib", "netlib-native_ref-osx-x86_64") - .exclude("com.github.fommil.netlib", "netlib-native_ref-linux-x86_64") - .exclude("com.github.fommil.netlib", "netlib-native_system-linux-x86_64") - .exclude("com.github.fommil.netlib", "netlib-native_system-osx-x86_64") .exclude("com.github.jsr203hadoop", "jsr203hadoop") .exclude("com.google.cloud", "google-cloud-nio") .exclude("com.google.cloud.bigdataoss", "gcs-connector") From 72366d2a7e6c23cc1323a25177b6f003d3b78474 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 9 Oct 2019 15:27:48 +0200 Subject: [PATCH 16/22] CircleCI indents --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 23ceb08c0..8c804142b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,8 +9,8 @@ jobs: - checkout - restore_cache: - keys: - - conda-deps-v1-{{ checksum "python/environment.yml" }} + keys: + - conda-deps-v1-{{ checksum "python/environment.yml" }} - run: name: install dependencies From 301e2d8efdbd1278ffb651cd86f7e6bf6ffb7e78 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 10 Oct 2019 16:31:15 +0200 Subject: [PATCH 17/22] Change bintray org --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index fa96bdfcf..9ee977b3f 100644 --- a/build.sbt +++ b/build.sbt @@ -171,7 +171,7 @@ ThisBuild / pomIncludeRepository := { _ => } ThisBuild / publishMavenStyle := true -ThisBuild / bintrayOrganization := Some("databricks") +ThisBuild / bintrayOrganization := Some("projectglow") import ReleaseTransformations._ From f6e667f46f64b0ce9edbd15a263425cc062707ff Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 10 Oct 2019 16:41:37 +0200 Subject: [PATCH 18/22] Setting version to 0.1.0 --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index 1be9a631a..e76544405 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.1.1-SNAPSHOT" +version in ThisBuild := "0.1.0" From 2a2bb612ee11fe08f52472172641d453d0727d46 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 10 Oct 2019 16:53:14 +0200 Subject: [PATCH 19/22] Bintray repo --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index 9ee977b3f..10e635f49 100644 --- a/build.sbt +++ b/build.sbt @@ -172,6 +172,7 @@ ThisBuild / pomIncludeRepository := { _ => ThisBuild / publishMavenStyle := true ThisBuild / bintrayOrganization := Some("projectglow") +ThisBuild / bintrayRepository := "glow" import ReleaseTransformations._ From 1d2aa32899c1bb0855449e855485e22a21712a15 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 10 Oct 2019 17:04:51 +0200 Subject: [PATCH 20/22] Move bintrayrepo --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index 10e635f49..a65d070b8 100644 --- a/build.sbt +++ b/build.sbt @@ -120,6 +120,7 @@ lazy val core = (project in file("core")) commonSettings, name := "glow", publish / skip := false, + bintrayRepository := "glow", libraryDependencies ++= dependencies, // Fix versions of libraries that are depended on multiple times dependencyOverrides ++= Seq( From 015ca20723c999902e2852704d5436982dd9b9cc Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 10 Oct 2019 17:05:56 +0200 Subject: [PATCH 21/22] Setting 
version to 0.1.1-SNAPSHOT --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index e76544405..1be9a631a 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.1.0" +version in ThisBuild := "0.1.1-SNAPSHOT" From e3cdb017e4498df9d143830e080a1ceb30ca91b5 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 11 Oct 2019 00:13:16 +0200 Subject: [PATCH 22/22] Whoops --- .circleci/config.yml | 2 -- .gitignore | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 97737a36e..e02464e63 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,8 +7,6 @@ jobs: steps: - checkout - - checkout - - restore_cache: keys: - conda-deps-v1-{{ checksum "python/environment.yml" }} diff --git a/.gitignore b/.gitignore index a68194dc1..83766268e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,6 @@ docs/build *.iml .idea -# ADAM logs +# Logs adam.log unit-tests.log
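The final patch removes a duplicated ``- checkout`` step from the CircleCI config. A quick way to catch that kind of slip before pushing, assuming the CircleCI CLI is installed locally (it is not part of this repo's tooling):

.. code-block:: sh

   # Check the config syntax, then print the expanded job so repeated or
   # misplaced steps are easy to spot
   circleci config validate .circleci/config.yml
   circleci config process .circleci/config.yml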