[Feat][Spark] Split datasources and core, prepare for support of multiple spark versions (#369)

* Refactoring of the GraphAr Spark project:

- split the datasources and the core GraphAr library
- introduce Maven profiles for different versions of Spark (a sketch of such a profile follows this list)
- small fixes to the PySpark part due to the new naming and paths
- new pom.xml files for the subprojects
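
The root pom.xml is not part of the visible diff, so here is a minimal sketch of what one such per-Spark-version profile could look like. The spark.version and scala.binary.version property names are taken from the new datasources-32/pom.xml below; the profile id, activation, and concrete version values are illustrative assumptions.

<!-- Hypothetical sketch of a Spark 3.2 profile in the root pom.xml.
     Property names match the datasources-32/pom.xml in this commit;
     the id and version literals are assumptions, not the commit's code. -->
<profiles>
  <profile>
    <id>datasources-32</id>
    <activation>
      <activeByDefault>true</activeByDefault>
    </activation>
    <properties>
      <spark.version>3.2.2</spark.version>
      <scala.binary.version>2.12</scala.binary.version>
    </properties>
    <modules>
      <module>datasources-32</module>
    </modules>
  </profile>
</profiles>

With such a profile in place, selecting a Spark build is a matter of passing -P <profile-id> to mvn.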
SemyonSinchenko authored Feb 22, 2024
1 parent 4158bb8 commit bdcf367
Showing 59 changed files with 568 additions and 269 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docs.yml
@@ -76,6 +76,7 @@ jobs:
       - name: Generate Doc
         run: |
+          export JAVA_HOME=${JAVA_HOME_11_X64}
           pushd docs
           make html
           popd
8 changes: 4 additions & 4 deletions .github/workflows/spark.yaml
@@ -46,21 +46,21 @@ jobs:
         run: |
           export JAVA_HOME=${JAVA_HOME_11_X64}
           pushd spark
-          mvn spotless:check
+          mvn --no-transfer-progress spotless:check
           popd
       - name: Build GraphAr Spark
         run: |
           export JAVA_HOME=${JAVA_HOME_11_X64}
           pushd spark
-          mvn clean package -DskipTests -Dspotless.check.skip=true
+          mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true
           popd
       - name: Run test
         run: |
           export JAVA_HOME=${JAVA_HOME_11_X64}
           pushd spark
-          mvn test -Dspotless.check.skip=true
+          mvn --no-transfer-progress test -Dspotless.check.skip=true
           popd
       - name: Run Neo4j2GraphAr example
@@ -135,4 +135,4 @@ jobs:
           ./neo4j.sh neo4j.json
           # stop and clean
           popd
           popd
8 changes: 4 additions & 4 deletions .licenserc.yaml
@@ -24,7 +24,7 @@ header:
     - 'LICENSE'
     - 'NOTICE'
     - 'testing'
-    - 'spark/src/test/resources'
+    - 'spark/graphar/src/test/resources'
     - 'java/src/test/resources'
     - '.licenserc.yaml'
     - '.gitignore'
@@ -33,10 +33,10 @@ header:
     - 'pre-commit-config.yaml'
     - 'docs'
     - '**/.gitignore'
-    - 'spark/.scalafix.conf'
-    - 'spark/.scalafmt.conf'
+    - '**/.scalafix.conf'
+    - '**/.scalafmt.conf'
     - 'cpp/apidoc'
-    - 'spark/src/main/scala/com/alibaba/graphar/datasources'
+    - 'spark/datasources-32/src/main/scala/com/alibaba/graphar/datasources'
     - '*.md'
     - '*.rst'
     - '**/*.json'
11 changes: 6 additions & 5 deletions docs/Makefile
@@ -37,15 +37,16 @@ cpp-apidoc:
 .PHONY: spark-apidoc
 spark-apidoc:
 	cd $(ROOTDIR)/spark && \
-	mvn scala:doc
+	mvn --no-transfer-progress clean install -DskipTests -Dspotless.check.skip=true && \
+	mvn --no-transfer-progress scala:doc
 
 .PHONY: html
 html: cpp-apidoc spark-apidoc
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 	rm -fr $(BUILDDIR)/html/spark/reference
-	cp -fr $(ROOTDIR)/spark/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
+	cp -fr $(ROOTDIR)/spark/graphar/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
 	cd $(ROOTDIR)/java && \
-	mvn -P javadoc javadoc:aggregate \
+	mvn --no-transfer-progress -P javadoc javadoc:aggregate \
 		-Dmaven.antrun.skip=true \
 		-DskipTests \
 		-Djavadoc.output.directory=$(ROOTDIR)/docs/$(BUILDDIR)/html/java/ \
@@ -64,9 +64,9 @@ html-poetry:
 	cd $(ROOTDIR)/pyspark && \
 	poetry run bash -c "cd $(ROOTDIR)/docs && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html"
 	rm -fr $(BUILDDIR)/html/spark/reference
-	cp -fr $(ROOTDIR)/spark/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
+	cp -fr $(ROOTDIR)/spark/graphar/target/site/scaladocs $(BUILDDIR)/html/spark/reference/
 	cd $(ROOTDIR)/java && \
-	mvn -P javadoc javadoc:aggregate \
+	mvn --no-transfer-progress -P javadoc javadoc:aggregate \
 		-Dmaven.antrun.skip=true \
 		-DskipTests \
 		-Djavadoc.output.directory=$(ROOTDIR)/docs/$(BUILDDIR)/html/java/ \
2 changes: 1 addition & 1 deletion pyspark/Makefile
@@ -15,7 +15,7 @@
 .PHONY: install_test
 install_test:
 	export JAVA_HOME=${JAVA_HOME_11_X64}
-	cd ../spark && mvn clean package -DskipTests -Dspotless.check.skip=true && cd ../pyspark
+	cd ../spark && mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true && cd ../pyspark
 	export PYSPARK_HADOOP_VERSION=3.2
 	poetry install --with=spark,tests
2 changes: 1 addition & 1 deletion pyspark/tests/conftest.py
@@ -17,7 +17,7 @@
 import pytest
 from pyspark.sql import SparkSession
 
-JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("target")
+JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("graphar").joinpath("target")
 GRAPHAR_SHADED_JAR_PATH = None
 
 for jar_file in JARS_PATH.glob("*.jar"):
1 change: 1 addition & 0 deletions spark/datasources-32/.scalafmt.conf
188 changes: 188 additions & 0 deletions spark/datasources-32/pom.xml
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2022-2023 Alibaba Group Holding Limited.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.alibaba</groupId>
<artifactId>graphar</artifactId>
<version>${graphar.version}</version>
</parent>

<groupId>com.alibaba</groupId>
<artifactId>graphar-datasources</artifactId>
<version>${graphar.version}</version>
<packaging>jar</packaging>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
<jvmArgs>
<jvmArg>-Xss4096K</jvmArg>
</jvmArgs>
</configuration>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</configuration>
</execution>
<execution>
<id>scala-test-compile</id>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.8.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<jvmArgs>
<jvmArg>-Xms64m</jvmArg>
<jvmArg>-Xmx1024m</jvmArg>
</jvmArgs>
<args>
<arg>-Ywarn-unused</arg>
</args>
<compilerPlugins>
<compilerPlugin>
<groupId>org.scalameta</groupId>
<artifactId>semanticdb-scalac_2.12.10</artifactId>
<version>4.3.24</version>
</compilerPlugin>
</compilerPlugins>
</configuration>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.20.0</version>
<configuration>
<!-- define a language-specific format -->
<java>
<!-- no need to specify files, inferred automatically, but you can if you want -->
<!-- apply a specific flavor of google-java-format and reflow long strings -->
<googleJavaFormat>
<version>1.13.0</version>
<style>AOSP</style>
</googleJavaFormat>
</java>
<scala>
<scalafmt>
<file>${project.basedir}/.scalafmt.conf</file> <!-- optional -->
</scalafmt>
</scala>
</configuration>
</plugin>
<plugin>
<groupId>io.github.evis</groupId>
<artifactId>scalafix-maven-plugin_2.13</artifactId>
<version>0.1.8_0.11.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
</plugins>
</build>
</project>
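
All Spark dependencies in the pom above use provided scope, so the resulting graphar-datasources jar does not bundle Spark; the Spark runtime on the cluster supplies those classes. A downstream project would declare the artifact roughly like this (the coordinates come from the pom above; the version value is a placeholder for whatever the build sets):

<!-- Sketch of a consumer-side dependency on the split-out module. GroupId and
     artifactId come from the pom.xml above; the version value is a placeholder. -->
<dependency>
  <groupId>com.alibaba</groupId>
  <artifactId>graphar-datasources</artifactId>
  <version>0.1.0-SNAPSHOT</version>
</dependency>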
@@ -17,6 +17,7 @@
 package com.alibaba.graphar.datasources
 
 import scala.collection.JavaConverters._
+import scala.util.matching.Regex
 import java.util
 
 import com.fasterxml.jackson.databind.ObjectMapper
@@ -34,14 +34,29 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.sql.sources.DataSourceRegister
 import org.apache.spark.sql.connector.expressions.Transform
 
-import com.alibaba.graphar.util.Utils
-
 object GarUtils
 
 /**
  * GarDataSource is a class to provide gar files as the data source for spark.
  */
 class GarDataSource extends TableProvider with DataSourceRegister {
+  private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+  /**
+   * Redact the sensitive information in the given string.
+   */
+  // Copy of redact from graphar Utils
+  private def redact(regex: Option[Regex], text: String): String = {
+    regex match {
+      case None => text
+      case Some(r) =>
+        if (text == null || text.isEmpty) {
+          text
+        } else {
+          r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
+        }
+    }
+  }
 
   /** The default fallback file format is Parquet. */
   def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat]
@@ -80,7 +96,7 @@ class GarDataSource extends TableProvider with DataSourceRegister {
     val name = shortName() + " " + paths
       .map(qualifiedPathName(_, hadoopConf))
       .mkString(",")
-    Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
+    redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
   }
 
   private def qualifiedPathName(
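
With the datasource split out, GraphAr files are read through GarDataSource as a DataSource V2 TableProvider. A minimal usage sketch in Scala follows; only the fully qualified class name comes from the diff above, while the input path is a placeholder and any extra options a given build may require are omitted.

// Minimal usage sketch, assuming the graphar-datasources jar is on the
// classpath. The input path is a placeholder.
import org.apache.spark.sql.SparkSession

object GarReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("gar-read-sketch")
      .master("local[*]")
      .getOrCreate()

    // GarDataSource extends TableProvider with DataSourceRegister, so it can
    // be addressed by its fully qualified class name (or its short name).
    val df = spark.read
      .format("com.alibaba.graphar.datasources.GarDataSource")
      .load("/tmp/graphar/vertex/person/chunk0")

    df.show()
    spark.stop()
  }
}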
1 change: 1 addition & 0 deletions spark/graphar/.scalafmt.conf