Commit f9c9cd8
[SC-5599] Inline spark-redshift sources into databricks/spark
This patch ports `spark-redshift` as of databricks/spark-avro@b01a034 and updates it to run with Spark 2.1.0. I didn't make any attempt to clean up the library, remove dead code, or modernize its dependencies; each of those tasks will take some time, and I'm hoping to delegate them once we have the infrastructure pieces in place.

Credentials are stored securely in Jenkins. If you need a copy of them for local development, talk to Josh and he'll share them via ZeroBin (this will be done via LastPass in the future).

Author: Josh Rosen <[email protected]>

Closes apache#171 from JoshRosen/add-spark-redshift.
1 parent 227ff64 commit f9c9cd8
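
For orientation, here is a minimal usage sketch of the spark-redshift data source being inlined in this commit. The cluster URL, credentials, table names, and S3 tempdir below are placeholders for illustration, not values taken from this commit.

// Hedged usage sketch of the com.databricks.spark.redshift data source.
// All connection values are placeholders.
import org.apache.spark.sql.SparkSession

object RedshiftUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("redshift-usage-sketch").getOrCreate()
    val jdbcUrl = "jdbc:redshift://example-cluster:5439/dev?user=USER&password=PASS" // placeholder
    val tempDir = "s3n://example-bucket/spark-redshift-scratch/"                     // placeholder

    // Read a Redshift table; rows are staged through the S3 tempdir and read back as a DataFrame.
    val df = spark.read
      .format("com.databricks.spark.redshift")
      .option("url", jdbcUrl)
      .option("dbtable", "my_table")
      .option("tempdir", tempDir)
      .load()

    // Write the DataFrame to another Redshift table via the same S3 staging area.
    df.write
      .format("com.databricks.spark.redshift")
      .option("url", jdbcUrl)
      .option("dbtable", "my_table_copy")
      .option("tempdir", tempDir)
      .mode("error")
      .save()
  }
}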

File tree: 54 files changed, +6128 −6 lines (some file contents are hidden by default for this large commit)
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2016 Databricks, Inc.
+ *
+ * Portions of this software incorporate or are derived from software contained within Apache Spark,
+ * and this modified software differs from the Apache Spark software provided under the Apache
+ * License, Version 2.0, a copy of which you may obtain at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+package org.apache.spark.tags;
+
+import java.lang.annotation.*;
+
+import org.scalatest.TagAnnotation;
+
+@TagAnnotation
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.METHOD, ElementType.TYPE})
+public @interface ExtendedRedshiftTest { }

dev/.rat-excludes

Lines changed: 1 addition & 0 deletions
@@ -103,3 +103,4 @@ org.apache.spark.scheduler.ExternalClusterManager
 org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
 spark-warehouse
 structured-streaming/*
+install-redshift-jdbc.sh

dev/install-redshift-jdbc.sh

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
+
+cd /tmp
+
+VERSION='1.1.7.1007'
+FILENAME="RedshiftJDBC4-$VERSION.jar"
+
+wget "https://s3.amazonaws.com/redshift-downloads/drivers/$FILENAME"
+
+$SPARK_ROOT_DIR/build/mvn install:install-file \
+  -Dfile=$FILENAME \
+  -DgroupId=com.amazonaws \
+  -DartifactId=redshift.jdbc4 \
+  -Dversion=$VERSION \
+  -Dpackaging=jar
+

dev/run-tests.py

Lines changed: 4 additions & 1 deletion
@@ -111,7 +111,8 @@ def determine_modules_to_test(changed_modules):
     >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
     >>> x # doctest: +NORMALIZE_WHITESPACE
     ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'sql-kafka-0-8', 'examples',
-     'hive-thriftserver', 'pyspark-sql', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
+     'hive-thriftserver', 'pyspark-sql', 'redshift', 'sparkr', 'pyspark-mllib',
+     'redshift-integration-tests', 'pyspark-ml']
     """
     modules_to_test = set()
     for module in changed_modules:
@@ -512,6 +513,8 @@ def main():
         test_env = "amplab_jenkins"
         # add path for Python3 in Jenkins if we're calling from a Jenkins machine
         os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
+        # Install Redshift JDBC
+        run_cmd([os.path.join(SPARK_HOME, "dev", "install-redshift-jdbc.sh")])
     else:
         # else we're running locally and can use local settings
         build_tool = "sbt"

dev/sparktestsupport/modules.py

Lines changed: 25 additions & 0 deletions
@@ -168,6 +168,31 @@ def __hash__(self):
     ]
 )
 
+redshift = Module(
+    name="redshift",
+    dependencies=[avro, sql],
+    source_file_regexes=[
+        "external/redshift",
+    ],
+    sbt_test_goals=[
+        "redshift/test",
+    ],
+    test_tags=[
+        "org.apache.spark.tags.ExtendedRedshiftTest"
+    ],
+)
+
+redshift_integration_tests = Module(
+    name="redshift-integration-tests",
+    dependencies=[redshift],
+    source_file_regexes=[
+        "external/redshift-integration-tests",
+    ],
+    sbt_test_goals=[
+        "redshift-integration-tests/test",
+    ],
+)
+
 sql_kafka = Module(
     name="sql-kafka-0-10",
     dependencies=[sql],

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+/*
+ * Copyright (C) 2016 Databricks, Inc.
+ *
+ * Portions of this software incorporate or are derived from software contained within Apache Spark,
+ * and this modified software differs from the Apache Spark software provided under the Apache
+ * License, Version 2.0, a copy of which you may obtain at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>spark-parent_2.11</artifactId>
+    <version>2.1.0</version>
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+
+  <groupId>com.databricks</groupId>
+  <artifactId>spark-redshift-integration-tests_2.11</artifactId>
+  <properties>
+    <sbt.project.name>redshift-integration-tests</sbt.project.name>
+  </properties>
+  <packaging>jar</packaging>
+  <name>Spark Redshift Integration Tests</name>
+  <url>http://spark.apache.org/</url>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.databricks</groupId>
+      <artifactId>spark-avro_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.databricks</groupId>
+      <artifactId>spark-redshift_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.databricks</groupId>
+      <artifactId>spark-redshift_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <!-- The AWS Java SDK version here should match (or be below) the version used in DBC images -->
+    <dependency>
+      <groupId>com.amazonaws</groupId>
+      <artifactId>aws-java-sdk-core</artifactId>
+      <version>1.9.40</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.amazonaws</groupId>
+      <artifactId>aws-java-sdk-s3</artifactId>
+      <version>1.9.40</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.amazonaws</groupId>
+      <artifactId>aws-java-sdk-sts</artifactId>
+      <version>1.9.40</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.eclipsesource.minimal-json</groupId>
+      <artifactId>minimal-json</artifactId>
+      <version>0.9.4</version>
+      <scope>compile</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-hive_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <!-- This dependency is installed using ./dev/install-redshift-jdbc.sh -->
+    <dependency>
+      <groupId>com.amazonaws</groupId>
+      <artifactId>redshift.jdbc4</artifactId>
+      <version>1.1.7.1007</version>
+      <type>jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-tags_${scala.binary.version}</artifactId>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <build>
+    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+  </build>
+</project>

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2016 Databricks, Inc.
+ *
+ * Portions of this software incorporate or are derived from software contained within Apache Spark,
+ * and this modified software differs from the Apache Spark software provided under the Apache
+ * License, Version 2.0, a copy of which you may obtain at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+package com.databricks.spark.redshift
+
+import java.net.URI
+
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+import org.apache.spark.tags.ExtendedRedshiftTest
+
+/**
+ * This suite performs basic integration tests where the AWS credentials have been
+ * encoded into the tempdir URI rather than being set in the Hadoop configuration.
+ */
+@ExtendedRedshiftTest
+class AWSCredentialsInUriIntegrationSuite extends IntegrationSuiteBase {
+
+  override protected val tempDir: String = {
+    val uri = new URI(AWS_S3_SCRATCH_SPACE + randomSuffix + "/")
+    new URI(
+      uri.getScheme,
+      s"$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY",
+      uri.getHost,
+      uri.getPort,
+      uri.getPath,
+      uri.getQuery,
+      uri.getFragment).toString
+  }
+
+
+  // Override this method so that we do not set the credentials in sc.hadoopConf.
+  override def beforeAll(): Unit = {
+    assert(tempDir.contains("AKIA"), "tempdir did not contain AWS credentials")
+    assert(!AWS_SECRET_ACCESS_KEY.contains("/"), "AWS secret key should not contain slash")
+    sc = new SparkContext("local", getClass.getSimpleName)
+    conn = DefaultJDBCWrapper.getConnector(None, jdbcUrl, None)
+  }
+
+  test("roundtrip save and load") {
+    val df = sqlContext.createDataFrame(sc.parallelize(Seq(Row(1)), 1),
+      StructType(StructField("foo", IntegerType) :: Nil))
+    testRoundtripSaveAndLoad(s"roundtrip_save_and_load_$randomSuffix", df)
+  }
+}
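
As a clarifying aside, the tempDir override above embeds the AWS access key and secret into the user-info portion of an s3n:// URI instead of setting them on the Hadoop configuration. A minimal standalone sketch of that construction follows; the bucket name and credentials are placeholders, not values from this commit.

// Hedged sketch: rebuild an S3 scratch URI with "ACCESS_KEY:SECRET_KEY" as user-info,
// mirroring the tempDir construction in AWSCredentialsInUriIntegrationSuite above.
import java.net.URI

object TempDirWithCredentialsSketch {
  def main(args: Array[String]): Unit = {
    val accessKeyId = "AKIAEXAMPLEKEY"                 // placeholder
    val secretAccessKey = "exampleSecretWithoutSlash"  // placeholder; the suite above requires no "/"
    val base = new URI("s3n://example-bucket/spark-redshift-scratch/run-1234/")

    val tempDirWithCreds = new URI(
      base.getScheme,
      s"$accessKeyId:$secretAccessKey",
      base.getHost,
      base.getPort,
      base.getPath,
      base.getQuery,
      base.getFragment).toString

    // Prints: s3n://AKIAEXAMPLEKEY:exampleSecretWithoutSlash@example-bucket/spark-redshift-scratch/run-1234/
    println(tempDirWithCreds)
  }
}

This is presumably also why beforeAll asserts that the secret key contains no slash: a "/" inside the user-info component would interfere with how the staging path is later parsed.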
