Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat][Spark] Split datasources and core, prepare for support of multiple spark versions #369

Merged
merged 19 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ jobs:

- name: Generate Doc
run: |
export JAVA_HOME=${JAVA_HOME_11_X64}
pushd docs
make html
popd
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/spark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,21 @@ jobs:
run: |
export JAVA_HOME=${JAVA_HOME_11_X64}
pushd spark
mvn spotless:check
mvn --no-transfer-progress spotless:check
popd

- name: Build GraphAr Spark
run: |
export JAVA_HOME=${JAVA_HOME_11_X64}
pushd spark
mvn clean package -DskipTests -Dspotless.check.skip=true
mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true
popd

- name: Run test
run: |
export JAVA_HOME=${JAVA_HOME_11_X64}
pushd spark
mvn test -Dspotless.check.skip=true
mvn --no-transfer-progress test -Dspotless.check.skip=true
popd

- name: Run Neo4j2GraphAr example
Expand Down Expand Up @@ -135,4 +135,4 @@ jobs:
./neo4j.sh neo4j.json

# stop and clean
popd
popd
8 changes: 4 additions & 4 deletions .licenserc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ header:
- 'LICENSE'
- 'NOTICE'
- 'testing'
- 'spark/src/test/resources'
- 'spark/graphar/src/test/resources'
- 'java/src/test/resources'
- '.licenserc.yaml'
- '.gitignore'
Expand All @@ -33,10 +33,10 @@ header:
- 'pre-commit-config.yaml'
- 'docs'
- '**/.gitignore'
- 'spark/.scalafix.conf'
- 'spark/.scalafmt.conf'
- '**/.scalafix.conf'
- '**/.scalafmt.conf'
- 'cpp/apidoc'
- 'spark/src/main/scala/com/alibaba/graphar/datasources'
- 'spark/datasources-32/src/main/scala/com/alibaba/graphar/datasources'
- '*.md'
- '*.rst'
- '**/*.json'
Expand Down
11 changes: 6 additions & 5 deletions docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,16 @@ cpp-apidoc:
.PHONY: spark-apidoc
spark-apidoc:
cd $(ROOTDIR)/spark && \
mvn scala:doc
mvn --no-transfer-progress clean install -DskipTests -Dspotless.check.skip=true && \
mvn --no-transfer-progress scala:doc

.PHONY: html
html: cpp-apidoc spark-apidoc
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
rm -fr $(BUILDDIR)/html/spark/reference
cp -fr $(ROOTDIR)/spark/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
cp -fr $(ROOTDIR)/spark/graphar/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
cd $(ROOTDIR)/java && \
mvn -P javadoc javadoc:aggregate \
mvn --no-transfer-progress -P javadoc javadoc:aggregate \
-Dmaven.antrun.skip=true \
-DskipTests \
-Djavadoc.output.directory=$(ROOTDIR)/docs/$(BUILDDIR)/html/java/ \
Expand All @@ -64,9 +65,9 @@ html-poetry:
cd $(ROOTDIR)/pyspark && \
poetry run bash -c "cd $(ROOTDIR)/docs && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html"
rm -fr $(BUILDDIR)/html/spark/reference
cp -fr $(ROOTDIR)/spark/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
cp -fr $(ROOTDIR)/spark/graphar/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
cd $(ROOTDIR)/java && \
mvn -P javadoc javadoc:aggregate \
mvn --no-transfer-progress -P javadoc javadoc:aggregate \
-Dmaven.antrun.skip=true \
-DskipTests \
-Djavadoc.output.directory=$(ROOTDIR)/docs/$(BUILDDIR)/html/java/ \
Expand Down
2 changes: 1 addition & 1 deletion pyspark/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
.PHONY: install_test
install_test:
export JAVA_HOME=${JAVA_HOME_11_X64}
cd ../spark && mvn clean package -DskipTests -Dspotless.check.skip=true && cd ../pyspark
cd ../spark && mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true && cd ../pyspark
export PYSPARK_HADOOP_VERSION=3.2
poetry install --with=spark,tests

Expand Down
2 changes: 1 addition & 1 deletion pyspark/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest
from pyspark.sql import SparkSession

JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("target")
JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("graphar").joinpath("target")
GRAPHAR_SHADED_JAR_PATH = None

for jar_file in JARS_PATH.glob("*.jar"):
Expand Down
1 change: 1 addition & 0 deletions spark/datasources-32/.scalafmt.conf
188 changes: 188 additions & 0 deletions spark/datasources-32/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2022-2023 Alibaba Group Holding Limited.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<!--
Maven module descriptor for the GraphAr Spark datasources module
(spark/datasources-32). Child of the com.alibaba:graphar parent POM;
scala.version, scala.binary.version, spark.version and graphar.version
are expected to be inherited from the parent.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.alibaba</groupId>
<artifactId>graphar</artifactId>
<version>${graphar.version}</version>
</parent>

<groupId>com.alibaba</groupId>
<artifactId>graphar-datasources</artifactId>
<version>${graphar.version}</version>
<packaging>jar</packaging>

<!-- All Spark dependencies are "provided": the Spark runtime supplies them,
so they are compiled against but not bundled into the jar. -->
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

<build>
<plugins>
<!-- NOTE(review): two Scala compiler plugins are configured below — the
legacy org.scala-tools:maven-scala-plugin and the maintained
net.alchim31.maven:scala-maven-plugin. Presumably only one actually
drives compilation; confirm whether the legacy plugin declaration
can be removed. -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<!-- emit JDK 8 compatible bytecode -->
<arg>-target:jvm-1.8</arg>
</args>
<jvmArgs>
<!-- enlarged thread stack for deeply recursive scalac phases -->
<jvmArg>-Xss4096K</jvmArg>
</jvmArgs>
</configuration>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<!-- strip jar signature files so a repackaged jar is not rejected -->
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</configuration>
</execution>
<execution>
<id>scala-test-compile</id>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.8.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<jvmArgs>
<jvmArg>-Xms64m</jvmArg>
<jvmArg>-Xmx1024m</jvmArg>
</jvmArgs>
<args>
<!-- warn on unused definitions; scalafix RemoveUnused relies on this -->
<arg>-Ywarn-unused</arg>
</args>
<!-- semanticdb compiler plugin: produces the semantic info scalafix needs -->
<compilerPlugins>
<compilerPlugin>
<groupId>org.scalameta</groupId>
<artifactId>semanticdb-scalac_2.12.10</artifactId>
<version>4.3.24</version>
</compilerPlugin>
</compilerPlugins>
</configuration>
</plugin>
<!-- spotless: code formatting checks (mvn spotless:check / spotless:apply) -->
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.20.0</version>
<configuration>
<!-- define a language-specific format -->
<java>
<!-- no need to specify files, inferred automatically, but you can if you want -->
<!-- apply a specific flavor of google-java-format and reflow long strings -->
<googleJavaFormat>
<version>1.13.0</version>
<style>AOSP</style>
</googleJavaFormat>
</java>
<scala>
<scalafmt>
<file>${project.basedir}/.scalafmt.conf</file> <!-- optional -->
</scalafmt>
</scala>
</configuration>
</plugin>
<!-- scalafix: Scala linting/rewriting (configured via .scalafix.conf) -->
<plugin>
<groupId>io.github.evis</groupId>
<artifactId>scalafix-maven-plugin_2.13</artifactId>
<version>0.1.8_0.11.0</version>
</plugin>
<!-- attach a -sources jar to the build artifacts -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- attach a -javadoc jar to the build artifacts -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.alibaba.graphar.datasources

import scala.collection.JavaConverters._
import scala.util.matching.Regex
import java.util

import com.fasterxml.jackson.databind.ObjectMapper
Expand All @@ -34,14 +35,29 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.connector.expressions.Transform

import com.alibaba.graphar.util.Utils

object GarUtils

/**
* GarDataSource is a class to provide gar files as the data source for spark.
*/
class GarDataSource extends TableProvider with DataSourceRegister {
private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"

/**
* Redact the sensitive information in the given string.
*/
// Copy of redact from graphar Utils
private def redact(regex: Option[Regex], text: String): String = {
regex match {
case None => text
case Some(r) =>
if (text == null || text.isEmpty) {
text
} else {
r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
}
}
}

/** The default fallback file format is Parquet. */
def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat]
Expand Down Expand Up @@ -80,7 +96,7 @@ class GarDataSource extends TableProvider with DataSourceRegister {
val name = shortName() + " " + paths
.map(qualifiedPathName(_, hadoopConf))
.mkString(",")
Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
}

private def qualifiedPathName(
Expand Down
1 change: 1 addition & 0 deletions spark/graphar/.scalafmt.conf
Loading
Loading