diff --git a/docs/config.toml b/docs/config.toml index 25cf3beb3..40343db31 100644 --- a/docs/config.toml +++ b/docs/config.toml @@ -30,12 +30,8 @@ home = [ "HTML", "RSS", "SearchIndex" ] { name = "0.12.1", pre = "relative", url = "../0.12.1", weight = 1000 } ] topnav = [ - { name = "Docs", url = "/docs/latest", weight = 100 }, - { name = "Releases", pre = "relative", url = "../../releases", weight = 600 }, - { name = "Spark", url = "/docs/latest/getting-started", weight = 200 }, - { name = "Flink", url = "/docs/latest/flink", weight = 300 }, - { name = "Trino", url = "https://trino.io/docs/current/connector/iceberg.html", weight = 400 }, - { name = "Presto", url = "https://prestodb.io/docs/current/connector/iceberg.html" , weight = 500 }, + { name = "Quickstart", url = "/spark-quickstart", weight = 100 }, + { name = "Docs", url = "/docs/latest", weight = 200 }, { name = "Blogs", pre = "relative", url = "../../blogs", weight = 998 }, { name = "Talks", pre = "relative", url = "../../talks", weight = 999 }, { name = "Roadmap", pre = "relative", url = "../../roadmap", weight = 997 }, diff --git a/iceberg-theme/layouts/partials/body.html b/iceberg-theme/layouts/partials/body.html index 5d04d7b46..fe4cc01d8 100644 --- a/iceberg-theme/layouts/partials/body.html +++ b/iceberg-theme/layouts/partials/body.html @@ -16,12 +16,12 @@ {{ partial "header.html" . }}
-<div class="grid-container">
+<div class="grid-container {{ if .Params.disableSidebar }}content-only{{ end }}">
{{ if not .Params.disableSidebar }} {{ partial "sidebar.html" . }} {{ end }}
-  <div class="markdown-body" id='full'>
+  <div class="markdown-body {{ if .Params.disableToc }}margin-without-toc{{ end }}" id='full'>
{{- .Content -}}
{{ if not .Params.disableToc }}
diff --git a/iceberg-theme/static/css/iceberg-theme.css b/iceberg-theme/static/css/iceberg-theme.css
index 4d20b62ee..30237946c 100644
--- a/iceberg-theme/static/css/iceberg-theme.css
+++ b/iceberg-theme/static/css/iceberg-theme.css
@@ -243,6 +243,15 @@ h4:hover a { visibility: visible}
   width: 100%;
 }
 
+/* Cards at the top of each quickstart page */
+.quickstart-container {
+  display: flex;
+  flex-wrap: wrap;
+  width: 80vw;
+  gap: 1rem;
+}
+
 .content-only {
   grid-template-columns: auto;
 }
@@ -263,6 +272,10 @@
   margin-right: 40%;
 }
 
+.margin-without-toc {
+  margin-right: 20%;
+}
+
 #toc {
   position: fixed;
   right: 0;
@@ -637,4 +650,67 @@ div#full ul.sub-menu {
 
 .versions-dropdown:hover .versions-dropdown-content {
   display: block;
+}
+
+.quickstart-card {
+  color: #fff;
+  width: 300px;
+  position: relative;
+  -webkit-box-shadow: 0px 5px 5px 0px rgba(0,0,0,0.3);
+  -moz-box-shadow: 0px 5px 5px 0px rgba(0,0,0,0.3);
+  box-shadow: 0px 5px 5px 0px rgba(0,0,0,0.3);
+  background-image: linear-gradient(to bottom right, #2879be, #57A7D8);
+}
+
+.quickstart-card .quickstart-card-content {
+  padding: 30px;
+  height: 250px;
+}
+
+.quickstart-card .quickstart-card-title {
+  font-size: 25px;
+  font-family: 'Open Sans', sans-serif;
+}
+
+.quickstart-card .quickstart-card-text {
+  line-height: 1.6;
+}
+
+.quickstart-card .quickstart-card-link {
+  padding: 25px;
+  width: -webkit-fill-available;
+  border-top: 1px solid #82c1bb;
+}
+
+.quickstart-card-link a {
+  text-decoration: none;
+  position: relative;
+  padding: 10px 0px;
+  color: #fff;
+}
+
+.quickstart-card .quickstart-card-link a:after {
+  top: 30px;
+  content: "";
+  display: block;
+  height: 2px;
+  left: 50%;
+  position: absolute;
+  width: 0;
+  background: #fff;
+
+  -webkit-transition: width 0.3s ease 0s, left 0.3s ease 0s;
+  -moz-transition: width 0.3s ease 0s, left 0.3s ease 0s;
+  -o-transition: width 0.3s ease 0s, left 0.3s ease 0s;
+  transition: width 0.3s ease 0s, left 0.3s ease 0s;
+}
+
+.quickstart-card .quickstart-card-link a:hover:after {
+  width: 100%;
+  left: 0;
}
\ No newline at end of file
diff --git a/landing-page/config.toml b/landing-page/config.toml
index e2296eb12..4ae237700 100644
--- a/landing-page/config.toml
+++ b/landing-page/config.toml
@@ -40,12 +40,9 @@ home = [ "HTML", "RSS", "SearchIndex" ]
   { name = "0.12.1", url = "/docs/0.12.1", weight = 1000 }
 ]
 topnav = [
-  { name = "Docs", url = "/docs/latest", weight = 100 },
+  { name = "Quickstart", url = "/spark-quickstart", weight = 100 },
+  { name = "Docs", url = "/docs/latest", weight = 200 },
   { name = "Releases", url = "/releases", weight = 600 },
-  { name = "Spark", url = "/docs/latest/getting-started", weight = 200 },
-  { name = "Flink", url = "/docs/latest/flink", weight = 300 },
-  { name = "Trino", identifier = "_trino", url = "https://trino.io/docs/current/connector/iceberg.html", weight = 400 },
-  { name = "Presto", identifier = "_presto", url = "https://prestodb.io/docs/current/connector/iceberg.html", weight = 500 },
   { name = "Blogs", url = "/blogs", weight = 998 },
   { name = "Talks", url = "/talks", weight = 999 },
   { name = "Roadmap", url = "/roadmap", weight = 997 },
@@ -63,6 +60,9 @@ home = [ "HTML", "RSS", "SearchIndex" ]
   { name = "Donate", identifier = "_donate", parent = "ASF", url = "https://www.apache.org/foundation/sponsorship.html" },
   { name = "Events",
identifier = "_events", parent = "ASF", url = "https://www.apache.org/events/current-event.html" }, ] + quickstarts = [ + { name = "Spark and Iceberg Quickstart", weight = 100, url = "spark-quickstart", post = "This quickstart will get you up and running with an Iceberg and Spark environment, including sample notebooks." } + ] [markup.goldmark.renderer] unsafe= true \ No newline at end of file diff --git a/landing-page/content/common/spark-quickstart.md b/landing-page/content/common/spark-quickstart.md new file mode 100644 index 000000000..27b389ec4 --- /dev/null +++ b/landing-page/content/common/spark-quickstart.md @@ -0,0 +1,323 @@ +--- +title: "Spark and Iceberg Quickstart" +weight: 100 +url: spark-quickstart +aliases: + - "quickstart" + - "quickstarts" + - "getting-started" +disableSidebar: true +disableToc: true +--- + + +{{% quickstarts %}} + +## Spark and Iceberg Quickstart + +This guide will get you up and running with an Iceberg and Spark environment, including sample code to +highlight some powerful features. You can learn more about Iceberg's Spark runtime by checking out the [Spark](../docs/latest/spark-ddl/) section. + +- [Docker-Compose](#docker-compose) +- [Creating a table](#creating-a-table) +- [Writing Data to a Table](#writing-data-to-a-table) +- [Reading Data from a Table](#reading-data-from-a-table) +- [Adding Iceberg to Spark](#adding-iceberg-to-spark) +- [Adding A Catalog](#adding-a-catalog) +- [Next Steps](#next-steps) + +### Docker-Compose + +The fastest way to get started is to use a docker-compose file that uses the the [tabulario/spark-iceberg](https://hub.docker.com/r/tabulario/spark-iceberg) image +which contains a local Spark cluster with a configured Iceberg catalog. To use this, you'll need to install the [Docker CLI](https://docs.docker.com/get-docker/) as well as the [Docker Compose CLI](https://github.com/docker/compose-cli/blob/main/INSTALL.md). + +Once you have those, save the yaml below into a file named `docker-compose.yml`: + +```yaml +version: "3" + +services: + spark-iceberg: + image: tabulario/spark-iceberg + depends_on: + - postgres + container_name: spark-iceberg + environment: + - SPARK_HOME=/opt/spark + - PYSPARK_PYTON=/usr/bin/python3.9 + - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + ports: + - 8888:8888 + - 8080:8080 + - 18080:18080 + postgres: + image: postgres:13.4-bullseye + container_name: postgres + environment: + - POSTGRES_USER=admin + - POSTGRES_PASSWORD=password + - POSTGRES_DB=demo_catalog + volumes: + - ./postgres/data:/var/lib/postgresql/data +``` + +Next, start up the docker containers with this command: +```sh +docker-compose up +``` + +You can then run any of the following commands to start a Spark session. + +{{% codetabs "LaunchSparkClient" %}} +{{% addtab "SparkSQL" checked %}} +{{% addtab "SparkShell" %}} +{{% addtab "PySpark" %}} +{{% tabcontent "SparkSQL" %}} +```sh +docker exec -it spark-iceberg spark-sql +``` +{{% /tabcontent %}} +{{% tabcontent "SparkShell" %}} +```sh +docker exec -it spark-iceberg spark-shell +``` +{{% /tabcontent %}} +{{% tabcontent "PySpark" %}} +```sh +docker exec -it spark-iceberg pyspark +``` +{{% /tabcontent %}} +{{% /codetabs %}} +{{< hint info >}} +You can also launch a notebook server by running `docker exec -it spark-iceberg notebook`. 
### Creating a table

To create your first Iceberg table in Spark, run a [`CREATE TABLE`](../docs/latest/spark-ddl/#create-table) command. Let's create a table
named `demo.nyc.taxis`, where `demo` is the catalog name, `nyc` is the database name, and `taxis` is the table name.

{{% codetabs "CreateATable" %}}
{{% addtab "SparkSQL" checked %}}
{{% addtab "SparkShell" %}}
{{% addtab "PySpark" %}}
{{% tabcontent "SparkSQL" %}}
```sql
CREATE TABLE demo.nyc.taxis
(
  vendor_id bigint,
  trip_id bigint,
  trip_distance float,
  fare_amount double,
  store_and_fwd_flag string
)
PARTITIONED BY (vendor_id);
```
{{% /tabcontent %}}
{{% tabcontent "SparkShell" %}}
```scala
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

val schema = StructType(Array(
    StructField("vendor_id", LongType, true),
    StructField("trip_id", LongType, true),
    StructField("trip_distance", FloatType, true),
    StructField("fare_amount", DoubleType, true),
    StructField("store_and_fwd_flag", StringType, true)
))
val df = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
df.writeTo("demo.nyc.taxis").create()
```
{{% /tabcontent %}}
{{% tabcontent "PySpark" %}}
```py
from pyspark.sql.types import DoubleType, FloatType, LongType, StructType, StructField, StringType

schema = StructType([
  StructField("vendor_id", LongType(), True),
  StructField("trip_id", LongType(), True),
  StructField("trip_distance", FloatType(), True),
  StructField("fare_amount", DoubleType(), True),
  StructField("store_and_fwd_flag", StringType(), True)
])

df = spark.createDataFrame([], schema)
df.writeTo("demo.nyc.taxis").create()
```
{{% /tabcontent %}}
{{% /codetabs %}}

Iceberg catalogs support the full range of SQL DDL commands, including:

* [`CREATE TABLE ... PARTITIONED BY`](../docs/latest/spark-ddl/#create-table)
* [`CREATE TABLE ... AS SELECT`](../docs/latest/spark-ddl/#create-table--as-select)
* [`ALTER TABLE`](../docs/latest/spark-ddl/#alter-table)
* [`DROP TABLE`](../docs/latest/spark-ddl/#drop-table)
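For example, adding a column is a metadata-only change and takes effect immediately — a minimal sketch against the
`taxis` table created above (the new column name is purely illustrative):

```sql
-- Add a column without rewriting existing data files;
-- existing rows read the new column as NULL
ALTER TABLE demo.nyc.taxis
ADD COLUMN fare_per_distance_unit float AFTER trip_distance;
```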
### Writing Data to a Table

Once your table is created, you can insert records.

{{% codetabs "InsertData" %}}
{{% addtab "SparkSQL" checked %}}
{{% addtab "SparkShell" %}}
{{% addtab "PySpark" %}}
{{% tabcontent "SparkSQL" %}}
```sql
INSERT INTO demo.nyc.taxis
VALUES (1, 1000371, 1.8, 15.32, 'N'),
       (2, 1000372, 2.5, 22.15, 'N'),
       (2, 1000373, 0.9, 9.01, 'N'),
       (1, 1000374, 8.4, 42.13, 'Y');
```
{{% /tabcontent %}}
{{% tabcontent "SparkShell" %}}
```scala
import org.apache.spark.sql.Row

val schema = spark.table("demo.nyc.taxis").schema
val data = Seq(
    Row(1: Long, 1000371: Long, 1.8f: Float, 15.32: Double, "N": String),
    Row(2: Long, 1000372: Long, 2.5f: Float, 22.15: Double, "N": String),
    Row(2: Long, 1000373: Long, 0.9f: Float, 9.01: Double, "N": String),
    Row(1: Long, 1000374: Long, 8.4f: Float, 42.13: Double, "Y": String)
)
val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
df.writeTo("demo.nyc.taxis").append()
```
{{% /tabcontent %}}
{{% tabcontent "PySpark" %}}
```py
schema = spark.table("demo.nyc.taxis").schema
data = [
    (1, 1000371, 1.8, 15.32, "N"),
    (2, 1000372, 2.5, 22.15, "N"),
    (2, 1000373, 0.9, 9.01, "N"),
    (1, 1000374, 8.4, 42.13, "Y")
]
df = spark.createDataFrame(data, schema)
df.writeTo("demo.nyc.taxis").append()
```
{{% /tabcontent %}}
{{% /codetabs %}}

### Reading Data from a Table

To read a table, simply use the Iceberg table's name.

{{% codetabs "SelectData" %}}
{{% addtab "SparkSQL" checked %}}
{{% addtab "SparkShell" %}}
{{% addtab "PySpark" %}}
{{% tabcontent "SparkSQL" %}}
```sql
SELECT * FROM demo.nyc.taxis;
```
{{% /tabcontent %}}
{{% tabcontent "SparkShell" %}}
```scala
spark.table("demo.nyc.taxis").show()
```
{{% /tabcontent %}}
{{% tabcontent "PySpark" %}}
```py
spark.table("demo.nyc.taxis").show()
```
{{% /tabcontent %}}
{{% /codetabs %}}

### Adding Iceberg to Spark

To add Iceberg to an existing Spark installation, use the `--packages` option.

{{% codetabs "AddIcebergToSpark" %}}
{{% addtab "SparkSQL" checked %}}
{{% addtab "SparkShell" %}}
{{% addtab "PySpark" %}}
{{% tabcontent "SparkSQL" %}}
```sh
spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}}
```
{{% /tabcontent %}}
{{% tabcontent "SparkShell" %}}
```sh
spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}}
```
{{% /tabcontent %}}
{{% tabcontent "PySpark" %}}
```sh
pyspark --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}}
```
{{% /tabcontent %}}
{{% /codetabs %}}

{{< hint info >}}
If you want to include Iceberg in your Spark installation, add the [Iceberg Spark runtime jar][spark-runtime-jar] to Spark's `jars` folder.
You can also find all release artifacts on the [Releases](https://iceberg.apache.org/releases/) page.
{{< /hint >}}

[spark-runtime-jar]: https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/{{% icebergVersion %}}/iceberg-spark-runtime-3.2_2.12-{{% icebergVersion %}}.jar

### Adding A Catalog

Iceberg has several catalog back-ends that can be used to track tables, like JDBC, Hive MetaStore and Glue.
Catalogs are configured using properties under `spark.sql.catalog.(catalog_name)`. The docker-compose environment
in this guide uses a JDBC catalog backed by the `postgres` container, and you can follow the same pattern to
configure other catalog types. To learn more, check out the [Catalog](../docs/latest/spark-configuration/#catalogs) page in the Spark section.

The configuration below creates a path-based catalog named `demo` for tables under `$PWD/warehouse` and adds support for Iceberg tables to Spark's built-in catalog.

{{% codetabs "AddingACatalog" %}}
{{% addtab "CLI" checked %}}
{{% addtab "spark-defaults" %}}
{{% tabcontent "CLI" %}}
```sh
spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}} \
    --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
    --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
    --conf spark.sql.catalog.spark_catalog.type=hive \
    --conf spark.sql.catalog.demo=org.apache.iceberg.spark.SparkCatalog \
    --conf spark.sql.catalog.demo.type=hadoop \
    --conf spark.sql.catalog.demo.warehouse=$PWD/warehouse \
    --conf spark.sql.defaultCatalog=demo
```
{{% /tabcontent %}}
{{% tabcontent "spark-defaults" %}}
```sh
spark.jars.packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}}
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
spark.sql.catalog.spark_catalog.type hive
spark.sql.catalog.demo org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.demo.type hadoop
spark.sql.catalog.demo.warehouse $PWD/warehouse
spark.sql.defaultCatalog demo
```
{{% /tabcontent %}}
{{% /codetabs %}}

{{< hint info >}}
If your Iceberg catalog is not set as the default catalog, you will have to switch to it by executing `USE demo;`
{{< /hint >}}
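The same properties can also be set when building a Spark session in code. Here is a minimal PySpark sketch
mirroring the CLI configuration above; the warehouse path and setting `spark.jars.packages` at session-build time
are assumptions to adapt to your setup:

```py
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Pull in the Iceberg Spark runtime and SQL extensions
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{% icebergVersion %}}")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Define a path-based catalog named `demo`, mirroring the CLI flags
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.type", "hadoop")
    .config("spark.sql.catalog.demo.warehouse", "warehouse")  # assumed path, relative to the driver
    .config("spark.sql.defaultCatalog", "demo")
    .getOrCreate()
)

spark.sql("SHOW DATABASES").show()
```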
### Next Steps

Now that you're up and running with Iceberg and Spark, check out the [Iceberg-Spark runtime docs](../docs/latest/spark-ddl/) to learn more!
\ No newline at end of file
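For context on the shortcode added below: it renders a card for each entry in the `quickstarts` menu defined in
`landing-page/config.toml`, skipping the card whose name matches the current page title. With only the Spark entry
defined, nothing extra is rendered on the Spark page; the grid becomes useful once a second quickstart exists.
A sketch of such a menu — the Flink entry is hypothetical, purely to illustrate the shape of a second card:

```toml
quickstarts = [
  { name = "Spark and Iceberg Quickstart", weight = 100, url = "spark-quickstart", post = "This quickstart will get you up and running with an Iceberg and Spark environment, including sample notebooks." },
  # Hypothetical second entry to illustrate how more cards would be added
  { name = "Flink and Iceberg Quickstart", weight = 200, url = "flink-quickstart", post = "A placeholder description for a second quickstart card." }
]
```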
diff --git a/landing-page/layouts/shortcodes/quickstarts.html b/landing-page/layouts/shortcodes/quickstarts.html
new file mode 100644
index 000000000..9ee36484d
--- /dev/null
+++ b/landing-page/layouts/shortcodes/quickstarts.html
@@ -0,0 +1,31 @@
+<h2>More Quickstarts</h2>
+<div class="quickstart-container">
+    {{ $currentPageTitle := .Page.Title }}{{ range .Site.Menus.quickstarts }}{{ if ne .Name $currentPageTitle }}
+    <div class="quickstart-card">
+        <div class="quickstart-card-content">
+            <div class="quickstart-card-title">
+                {{ .Name }}
+            </div>
+            <div class="quickstart-card-text">
+                {{ substr .Post 0 130 }}
+            </div>
+        </div>
+        <div class="quickstart-card-link">
+            <a href="{{ .URL }}">Read More</a>
+        </div>
+    </div>
+    {{ else }}{{ end }}{{ end }}
+</div>
\ No newline at end of file