54 changes: 41 additions & 13 deletions .circleci/config.yml
@@ -327,7 +327,7 @@ jobs:
run-scala-tests:
<<: *test-defaults
# project/CirclePlugin.scala does its own test splitting in SBT based on CIRCLE_NODE_INDEX, CIRCLE_NODE_TOTAL
parallelism: 12
parallelism: 9
# Spark runs a lot of tests in parallel, we need 16 GB of RAM for this
resource_class: xlarge
steps:
@@ -416,28 +416,55 @@ jobs:
- store_artifacts:
path: /tmp/yarn-tests

deploy:
build-maven-versioned:
<<: *defaults
docker:
- image: palantirtechnologies/circle-spark-r
steps:
- *checkout-code
- attach_workspace:
at: .
- restore_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
- restore_cache:
key: build-binaries-{{ checksum "build/mvn" }}-{{ checksum "build/sbt" }}
- run:
command: dev/set_version_and_package.sh
# This is potentially costly but we can't use the workspace as it would conflict with
# compilation results from build-sbt
- save_cache:
key: v1-maven-build-with-version-{{ .Branch }}-{{ .Revision }}
paths:
- .

deploy:
<<: *defaults
# Some part of the maven setup fails if there's no R, so we need to use the R image here
docker:
- image: palantirtechnologies/circle-spark-r
steps:
# This cache contains the whole project after version was set and mvn package was called
# Restoring first (and instead of checkout) as mvn versions:set mutates real source code...
- restore_cache:
key: v1-maven-build-with-version-{{ .Branch }}-{{ .Revision }}
- restore_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
- restore_cache:
key: build-binaries-{{ checksum "build/mvn" }}-{{ checksum "build/sbt" }}

- run: echo "user=$BINTRAY_USERNAME" > .credentials
- run: echo "password=$BINTRAY_PASSWORD" >> .credentials
- run: echo "realm=Bintray API Realm" >> .credentials
- run: echo "host=api.bintray.com" >> .credentials
- deploy: dev/publish.sh
- deploy:
command: dev/publish.sh
- store_artifacts:
path: /tmp/make-distribution.log
destination: make-distribution.log
path: /tmp/make-dist.log
destination: make-dist.log
- store_artifacts:
path: /tmp/publish_artifacts.log
destination: publish_artifacts.log
- deploy: curl -u $BINTRAY_USERNAME:$BINTRAY_PASSWORD -X POST https://api.bintray.com/content/palantir/releases/spark/$(git describe --tags)/publish
- deploy:
command: |
curl -u $BINTRAY_USERNAME:$BINTRAY_PASSWORD -X POST https://api.bintray.com/content/palantir/releases/spark/$(git describe --tags)/publish

workflows:
version: 2
@@ -473,6 +500,10 @@ workflows:
requires:
- build-sbt
<<: *all-branches-and-tags
- build-maven-versioned:
requires:
- build-maven
<<: *deployable-branches-and-tags
- deploy:
requires:
- build-maven
@@ -483,8 +514,5 @@
- run-scala-tests
- run-python-tests
- run-r-tests
filters:
tags:
only: /[0-9]+(?:\.[0-9]+){2,}-palantir\.[0-9]+(?:\.[0-9]+)*/
branches:
only: master
- build-maven-versioned
<<: *deployable-branches-and-tags
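
The reduced parallelism above (12 down to 9 containers) still produces balanced runs because, as the comment near the top of this file's diff notes, project/CirclePlugin.scala splits the Scala test classes itself from CIRCLE_NODE_INDEX and CIRCLE_NODE_TOTAL. A minimal sketch of that idea, with hypothetical names rather than the plugin's actual code, and a naive round-robin split where the real plugin builds timing-balanced buckets:

    // Sketch only: pick this container's share of test classes from CircleCI env vars.
    object CircleSplitSketch {
      // CIRCLE_NODE_INDEX identifies this container; CIRCLE_NODE_TOTAL is the job's parallelism.
      private val index: Int = sys.env.getOrElse("CIRCLE_NODE_INDEX", "0").toInt
      private val total: Int = sys.env.getOrElse("CIRCLE_NODE_TOTAL", "1").toInt

      // Naive round-robin assignment; CirclePlugin instead buckets classes by recorded run time.
      def testsForThisNode(allTestClasses: Seq[String]): Seq[String] =
        allTestClasses.zipWithIndex.collect { case (cls, i) if i % total == index => cls }
    }
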
2 changes: 2 additions & 0 deletions dev/.rat-excludes
@@ -107,6 +107,8 @@ circle.yml
.credentials
publish.sh
publish-local.sh
publish_functions.sh
set_version_and_package.sh
structured-streaming/*
kafka-source-initial-offset-version-2.1.0.bin
kafka-source-initial-offset-future-version.bin
26 changes: 3 additions & 23 deletions dev/publish.sh
@@ -1,30 +1,10 @@
#!/usr/bin/env bash

set -euo pipefail
version=$(git describe --tags)

PALANTIR_FLAGS=(-Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Phive -Pyarn -Psparkr)
FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)"

publish_artifacts() {
tmp_settings="tmp-settings.xml"
echo "<settings><servers><server>" > $tmp_settings
echo "<id>bintray-palantir-release</id><username>$BINTRAY_USERNAME</username>" >> $tmp_settings
echo "<password>$BINTRAY_PASSWORD</password>" >> $tmp_settings
echo "</server></servers></settings>" >> $tmp_settings

./build/mvn versions:set -DnewVersion=$version
./build/mvn --settings $tmp_settings -DskipTests "${PALANTIR_FLAGS[@]}" deploy
}

make_dist() {
build_flags="$1"
shift 1
hadoop_name="hadoop-palantir"
artifact_name="spark-dist_2.11-${hadoop_name}"
file_name="spark-dist-${version}-${hadoop_name}.tgz"
./dev/make-distribution.sh --name "hadoop-palantir" --tgz "$@" $build_flags | tee -a "/tmp/make-distribution.log"
curl -u $BINTRAY_USERNAME:$BINTRAY_PASSWORD -T $file_name "https://api.bintray.com/content/palantir/releases/spark/${version}/org/apache/spark/${artifact_name}/${version}/${artifact_name}-${version}.tgz"
}
source "$FWDIR/publish_functions.sh"

publish_artifacts | tee -a "/tmp/publish_artifacts.log"
make_dist "${PALANTIR_FLAGS[*]}" --clean
make_dist "${PALANTIR_FLAGS[*]}" | tee -a "/tmp/make-dist.log"
37 changes: 37 additions & 0 deletions dev/publish_functions.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

set -euo pipefail

PALANTIR_FLAGS=(-Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Phive -Pyarn -Psparkr)

get_version() {
git describe --tags --first-parent
}

set_version_and_package() {
version=$(get_version)
./build/mvn versions:set -DnewVersion="$version"
./build/mvn -DskipTests "${PALANTIR_FLAGS[@]}" package
}

publish_artifacts() {
tmp_settings="tmp-settings.xml"
echo "<settings><servers><server>" > $tmp_settings
echo "<id>bintray-palantir-release</id><username>$BINTRAY_USERNAME</username>" >> $tmp_settings
echo "<password>$BINTRAY_PASSWORD</password>" >> $tmp_settings
echo "</server></servers></settings>" >> $tmp_settings

./build/mvn --settings $tmp_settings -DskipTests "${PALANTIR_FLAGS[@]}" deploy
}

make_dist() {
build_flags="$1"
shift 1
version=$(get_version)
hadoop_name="hadoop-palantir"
artifact_name="spark-dist_2.11-${hadoop_name}"
file_name="spark-dist-${version}-${hadoop_name}.tgz"
./dev/make-distribution.sh --name "hadoop-palantir" --tgz "$@" $build_flags
curl -u $BINTRAY_USERNAME:$BINTRAY_PASSWORD -T "$file_name" "https://api.bintray.com/content/palantir/releases/spark/${version}/org/apache/spark/${artifact_name}/${version}/${artifact_name}-${version}.tgz"
curl -u $BINTRAY_USERNAME:$BINTRAY_PASSWORD -X POST "https://api.bintray.com/content/palantir/releases/spark/${version}/publish"
}
9 changes: 9 additions & 0 deletions dev/set_version_and_package.sh
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

set -euo pipefail

FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)"

source "$FWDIR/publish_functions.sh"

set_version_and_package
42 changes: 28 additions & 14 deletions project/CirclePlugin.scala
@@ -26,6 +26,8 @@ import sbt.Keys._
import sbt._
import sbt.plugins.JvmPlugin
import scalaz.Dequeue
import scalaz.Heap
import scalaz.Order

//noinspection ScalaStyle
object CirclePlugin extends AutoPlugin {
@@ -56,9 +58,9 @@ object CirclePlugin extends AutoPlugin {

case class TestKey(source: String, classname: String)

// Through this magical command we established that the average class run_time is 7.737
// Through this magical command we established that the average class run_time is 8.841
// jq <~/results.json '.tests | map(select(.result != "skipped")) | group_by(.classname) | map(map(.run_time) | add) | (add / length)'
private[this] val AVERAGE_TEST_CLASS_RUN_TIME = 7.737d
private[this] val AVERAGE_TEST_CLASS_RUN_TIME = 8.841d

override def globalSettings: Seq[Def.Setting[_]] = List(
circleTestsByProject := {
@@ -107,9 +109,9 @@
.map(_.asScala)
.getOrElse(Iterator())
.toStream
.filter(_.result != "skipped") // don't count timings of tests that didn't run
.groupBy(result => TestKey(result.source, result.classname))
.mapValues(_.foldLeft(0.0d)(_ + _.run_time))
// don't count timings on skipped tests, but remember we've seen the classname
.mapValues(_.filter(_.result != "skipped").foldLeft(0.0d)(_ + _.run_time))
} catch {
case e: Exception =>
log.warn(f"Couldn't read test results file: $testResultsFile")
@@ -149,12 +151,19 @@
val tests = Dequeue[(TestKey, Double)](
allTestsTimings.toIndexedSeq.sortBy { case (key, runTime) => (runTime, key) } : _*)

case class Group(tests: List[TestKey], runTime: Double)

implicit val groupOrder: Order[Group] = {
import scalaz.std.anyVal._
Order.orderBy(_.runTime)
}

@tailrec
def process(tests: Dequeue[(TestKey, Double)],
soFar: Double = 0d,
takeLeft: Boolean = true,
acc: List[TestKey] = Nil,
groups: List[List[TestKey]] = Nil): List[List[TestKey]] = {
groups: List[Group] = Nil): List[Group] = {

if (groups.size == totalNodes || tests.isEmpty) {
// Short circuit the logic if we've just completed the last group
@@ -163,12 +172,17 @@
return if (tests.isEmpty) {
groups
} else {
// Fit all remaining tests in the last issued bucket.
val lastGroup +: restGroups = groups
val toFit = tests.toStream.map(_._1).force
log.info(s"Fitting remaining tests into first bucket (which already has " +
s"${lastGroup.size} tests): $toFit")
(toFit ++: lastGroup) :: restGroups
log.info(s"Fitting remaining tests into smallest buckets: $toFit")
// Fit all remaining tests into the least used buckets.
// import needed for creating Heap from List (needs Foldable[List[_]])
import scalaz.std.list._
tests.foldLeft(Heap.fromData(groups)) { case(heap, (test, runTime)) =>
heap.uncons match {
case Some((group, rest)) =>
rest.insert(group.copy(test :: group.tests, runTime + group.runTime))
}
}.toList
}
}

@@ -192,7 +206,7 @@
case x@TestCandidate((_, runTime), _, _) if soFar + runTime <= timePerNode => x
} match {
case None =>
process(tests, 0d, takeLeft = true, Nil, acc :: groups)
process(tests, 0d, takeLeft = true, Nil, Group(acc, soFar) :: groups)
case Some(TestCandidate((key, runTime), rest, fromLeft)) =>
process(rest, soFar + runTime, fromLeft, key :: acc, groups)
}
@@ -202,11 +216,11 @@
val rootTarget = (target in LocalRootProject).value
val bucketsFile = rootTarget / "tests-by-bucket.json"
log.info(s"Saving test distribution into $totalNodes buckets to: $bucketsFile")
mapper.writeValue(bucketsFile, buckets)
val timingsPerBucket = buckets.map(_.iterator.map(allTestsTimings.apply).sum)
mapper.writeValue(bucketsFile, buckets.map(_.tests))
val timingsPerBucket = buckets.map(_.tests.iterator.map(allTestsTimings.apply).sum)
log.info(s"Estimated test timings per bucket: $timingsPerBucket")

val bucket = buckets.lift.apply(index).getOrElse(Nil)
val bucket = buckets.map(_.tests).lift.apply(index).getOrElse(Nil)

val groupedByProject = bucket.flatMap(testsByKey.apply)
.groupBy(_.project)
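
The leftover-test handling added above replaces "dump everything into the last issued bucket" with a min-heap keyed on each group's accumulated run time, so every remaining test lands in whichever bucket is currently cheapest. A self-contained sketch of that step, using assumed names and only the scalaz calls the diff itself imports (Heap.fromData, uncons, insert, Order.orderBy):

    import scalaz.{Heap, Order}
    import scalaz.std.anyVal._ // Order[Double], used to order groups by run time
    import scalaz.std.list._   // Foldable[List], required by Heap.fromData

    // Sketch only: spill leftover (test, runTime) pairs into the least-loaded bucket.
    case class Group(tests: List[String], runTime: Double)

    object SpillSketch {
      implicit val groupOrder: Order[Group] = Order.orderBy(_.runTime)

      def spill(groups: List[Group], leftovers: List[(String, Double)]): List[Group] =
        leftovers.foldLeft(Heap.fromData(groups)) { case (heap, (test, runTime)) =>
          heap.uncons match {
            case Some((cheapest, rest)) =>
              // Add the test to the cheapest bucket, then re-insert it with its new total run time.
              rest.insert(cheapest.copy(test :: cheapest.tests, cheapest.runTime + runTime))
            case None => heap // no buckets to fill; cannot happen once groups.size == totalNodes
          }
        }.toList
    }

Because the heap is rebuilt after each insertion, buckets that receive a leftover test sink back down only once their accumulated run time exceeds the others, which keeps the estimated per-bucket timings close together.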