Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat][Spark] Split datasources and core, prepare for support of multiple spark versions #369

Merged
merged 19 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .licenserc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ header:
- 'LICENSE'
- 'NOTICE'
- 'testing'
- 'spark/src/test/resources'
- 'spark/graphar/src/test/resources'
- 'java/src/test/resources'
- '.licenserc.yaml'
- '.gitignore'
Expand All @@ -33,10 +33,10 @@ header:
- 'pre-commit-config.yaml'
- 'docs'
- '**/.gitignore'
- 'spark/.scalafix.conf'
- 'spark/.scalafmt.conf'
- '**/.scalafix.conf'
- '**/.scalafmt.conf'
- 'cpp/apidoc'
- 'spark/src/main/scala/com/alibaba/graphar/datasources'
- 'spark/datasources-*/src/main/scala/com/alibaba/graphar/datasources'
- '*.md'
- '*.rst'
- '**/*.json'
Expand Down
2 changes: 1 addition & 1 deletion pyspark/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest
from pyspark.sql import SparkSession

JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("target")
JARS_PATH = Path(__file__).parent.parent.parent.joinpath("spark").joinpath("graphar").joinpath("target")
GRAPHAR_SHADED_JAR_PATH = None

for jar_file in JARS_PATH.glob("*.jar"):
Expand Down
1 change: 1 addition & 0 deletions spark/datasources-32/.scalafmt.conf
188 changes: 188 additions & 0 deletions spark/datasources-32/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2022-2023 Alibaba Group Holding Limited.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.alibaba</groupId>
<artifactId>graphar</artifactId>
<version>${graphar.version}</version>
</parent>

<groupId>com.alibaba</groupId>
<artifactId>graphar-datasources</artifactId>
<version>${graphar.version}</version>
<packaging>jar</packaging>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
<jvmArgs>
<jvmArg>-Xss4096K</jvmArg>
</jvmArgs>
</configuration>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</configuration>
</execution>
<execution>
<id>scala-test-compile</id>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.8.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<jvmArgs>
<jvmArg>-Xms64m</jvmArg>
<jvmArg>-Xmx1024m</jvmArg>
</jvmArgs>
<args>
<arg>-Ywarn-unused</arg>
</args>
<compilerPlugins>
<compilerPlugin>
<groupId>org.scalameta</groupId>
<artifactId>semanticdb-scalac_2.12.10</artifactId>
<version>4.3.24</version>
</compilerPlugin>
</compilerPlugins>
</configuration>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.20.0</version>
<configuration>
<!-- define a language-specific format -->
<java>
<!-- no need to specify files, inferred automatically, but you can if you want -->
<!-- apply a specific flavor of google-java-format and reflow long strings -->
<googleJavaFormat>
<version>1.13.0</version>
<style>AOSP</style>
</googleJavaFormat>
</java>
<scala>
<scalafmt>
<file>${project.basedir}/.scalafmt.conf</file> <!-- optional -->
</scalafmt>
</scala>
</configuration>
</plugin>
<plugin>
<groupId>io.github.evis</groupId>
<artifactId>scalafix-maven-plugin_2.13</artifactId>
<version>0.1.8_0.11.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
</plugins>
</build>
</project>
SemyonSinchenko marked this conversation as resolved.
Show resolved Hide resolved
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.alibaba.graphar.datasources

import scala.collection.JavaConverters._
import scala.util.matching.Regex
import java.util

import com.fasterxml.jackson.databind.ObjectMapper
Expand All @@ -34,14 +35,29 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.connector.expressions.Transform

import com.alibaba.graphar.util.Utils

object GarUtils

/**
* GarDataSource is a class to provide gar files as the data source for spark.
*/
class GarDataSource extends TableProvider with DataSourceRegister {
private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"

/**
* Redact the sensitive information in the given string.
*/
// Copy of redact from graphar Utils
private def redact(regex: Option[Regex], text: String): String = {
regex match {
case None => text
case Some(r) =>
if (text == null || text.isEmpty) {
text
} else {
r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
}
}
}

/** The default fallback file format is Parquet. */
def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat]
Expand Down Expand Up @@ -80,7 +96,7 @@ class GarDataSource extends TableProvider with DataSourceRegister {
val name = shortName() + " " + paths
.map(qualifiedPathName(_, hadoopConf))
.mkString(",")
Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
}

private def qualifiedPathName(
Expand Down
1 change: 1 addition & 0 deletions spark/graphar/.scalafmt.conf
Loading
Loading