Skip to content

Commit c470ef1

Browse files
Merge pull request #16 from StabRise/update-4.0
Added notebook for spark 4
2 parents b1aa14a + dbff9ea commit c470ef1

File tree

3 files changed

+903
-12
lines changed

3 files changed

+903
-12
lines changed

README.md

+8-4
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@
4141

4242
**Source Code**: [https://github.com/StabRise/spark-pdf](https://github.com/StabRise/spark-pdf)
4343

44-
**Quick Start Jupyter Notebook**: [PdfDataSource.ipynb](https://github.com/StabRise/spark-pdf/blob/main/examples/PdfDataSource.ipynb)
44+
**Quick Start Jupyter Notebook Spark 3.x.x**: [PdfDataSource.ipynb](https://github.com/StabRise/spark-pdf/blob/main/examples/PdfDataSource.ipynb)
45+
46+
**Quick Start Jupyter Notebook Spark 4.0.x**: [PdfDataSourceSpark4.ipynb](https://github.com/StabRise/spark-pdf/blob/main/examples/PdfDataSourceSpark4.ipynb)
4547

4648
---
4749

@@ -61,10 +63,12 @@ If you found this project useful, please give a star to the repository.
6163

6264
## Requirements
6365

64-
- Java 8, 11
65-
- Apache Spark 3.3.2, 3.4.1, 3.5.0
66+
- Java 8, 11, 17
67+
- Apache Spark 3.3.2, 3.4.1, 3.5.0, 4.0.0
6668
- Ghostscript 9.50 or later (only for the GhostScript reader)
6769

70+
Spark 4.0.0 is supported in version `0.1.11` and later (requires Java 17 and Scala 2.13).
71+
6872
## Installation
6973

7074
Binary package is available in the Maven Central Repository.
@@ -73,7 +77,7 @@ Binary package is available in the Maven Central Repository.
7377
- **Spark 3.5.***: com.stabrise:spark-pdf-spark35_2.12:0.1.11
7478
- **Spark 3.4.***: com.stabrise:spark-pdf-spark34_2.12:0.1.11
7579
- **Spark 3.3.***: com.stabrise:spark-pdf-spark33_2.12:0.1.11
76-
80+
- **Spark 4.0.***: com.stabrise:spark-pdf-spark40_2.13:0.1.11
7781

7882
## Options for the data source:
7983

build.sbt

+12-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import xerial.sbt.Sonatype.GitHubHosting
33

44
ThisBuild / version := "0.1.11"
55

6-
ThisBuild / scalaVersion := scala.util.Properties.envOrElse("SCALA_VERSION", "2.12.15") // "2.13.14"
6+
ThisBuild / scalaVersion := scala.util.Properties.envOrElse("SCALA_VERSION", "2.12.15") // "2.13.14", "2.12.15"
77
ThisBuild / organization := "com.stabrise"
88
ThisBuild / organizationName := "StabRise"
99
ThisBuild / organizationHomepage := Some(url("https://www.stabrise.com"))
@@ -16,18 +16,18 @@ ThisBuild / scmInfo := Some(
1616
)
1717
)
1818

19-
ThisBuild / sonatypeProjectHosting := Some(GitHubHosting("StabRise", "spark-pdf", "kolia1985@gmail.com"))
19+
ThisBuild / sonatypeProjectHosting := Some(GitHubHosting("StabRise", "spark-pdf", "mykola.melnyk.ml@gmail.com"))
2020
ThisBuild / versionScheme := Some("early-semver")
2121
ThisBuild / developers := List(
2222
Developer(
23-
id = "kolia1985",
23+
id = "mykolamelnykml",
2424
name = "Mykola Melnyk",
25-
email = "kolia1985@gmail.com",
25+
email = "mykola.melnyk.ml@gmail.com",
2626
url = url("https://stabrise.com")
2727
)
2828
)
2929

30-
ThisBuild / description := "PDF Datasource for Apache Spark. Read PDF files to the DataFrame."
30+
ThisBuild / description := "PDF Datasource for Apache Spark. Read PDF files lazy to the DataFrame."
3131
ThisBuild / licenses := List("AGPL-V3" -> new URL("https://www.gnu.org/licenses/agpl-3.0.html"))
3232
ThisBuild / homepage := Some(url("https://stabrise.com/spark-pdf/"))
3333
ThisBuild / sonatypeCredentialHost := sonatypeCentralHost
@@ -37,7 +37,8 @@ ThisBuild / publishTo := sonatypePublishToBundle.value
3737
root / Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.ScalaLibrary
3838
root / Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat
3939

40-
val sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", "3.5.3") // "4.0.0-preview2", "3.4.1", "3.3.2
40+
// Supported values: "4.0.0-preview2", "3.5.3", "3.4.1", "3.3.2"
41+
val sparkVersion = scala.util.Properties.envOrElse("SPARK_VERSION", "3.5.3")
4142

4243
val packageName =
4344
sparkVersion match {
@@ -145,7 +146,8 @@ lazy val assemblySettings = Seq(
145146
case PathList(ps @ _*) if ps.filter(_.contains ( "macos")).nonEmpty => MergeStrategy.discard
146147
case PathList(ps @ _*) if ps.filter(_.contains ( "windows")).nonEmpty => MergeStrategy.discard
147148
case PathList(ps @ _*) if ps.filter(_.contains ( "ios")).nonEmpty => MergeStrategy.discard
148-
case PathList(ps @ _*) if ps.filter(p => p.contains("linux-arm") || p.contains("arm64-v8a") || p.contains("armeabi") ).nonEmpty => MergeStrategy.discard
149+
case PathList(ps @ _*) if ps.filter(p => p.contains("linux-arm") || p.contains("arm64-v8a") ||
150+
p.contains("armeabi") ).nonEmpty => MergeStrategy.discard
149151
case PathList(ps @ _*) if ps.filter(_.contains("linux-ppc")).nonEmpty => MergeStrategy.discard
150152
case PathList(ps @ _*) if ps.filter(_.contentEquals("linux-x86")).nonEmpty => MergeStrategy.discard
151153
case PathList(ps @ _*) if ps.filter(_.contentEquals("windows-x86")).nonEmpty => MergeStrategy.discard
@@ -155,7 +157,9 @@ lazy val assemblySettings = Seq(
155157
case "StaticLoggerBinder" => MergeStrategy.discard
156158
case PathList("net", "imglib2", "util", "StopWatch.class") => MergeStrategy.first
157159
case PathList("META-INF", fileName)
158-
if List("NOTICE", "MANIFEST.MF", "DEPENDENCIES", "INDEX.LIST").contains(fileName) || fileName.endsWith(".txt") || fileName.endsWith(".RSA") || fileName.endsWith(".DSA") || fileName.endsWith(".SF")
160+
if List("NOTICE", "MANIFEST.MF", "DEPENDENCIES", "INDEX.LIST").contains(fileName) ||
161+
fileName.endsWith(".txt") || fileName.endsWith(".RSA") ||
162+
fileName.endsWith(".DSA") || fileName.endsWith(".SF")
159163
=> MergeStrategy.discard
160164
case "META-INF/services/javax.imageio.spi.ImageReaderSpi" => MergeStrategy.concat
161165
case PathList("META-INF", "services", _@_*) => MergeStrategy.first

0 commit comments

Comments
 (0)