[SPARK-24244][SQL] Passing only required columns to the CSV parser #21296
New file (92 lines) adding a `CSVBenchmarks` object:

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.execution.datasources.csv

import java.io.File

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types._
import org.apache.spark.util.{Benchmark, Utils}

/**
 * Benchmark to measure CSV read/write performance.
 * To run this:
 *  spark-submit --class <this class> --jars <spark sql test jar>
 */
object CSVBenchmarks {
  val conf = new SparkConf()

  val spark = SparkSession.builder
    .master("local[1]")
    .appName("benchmark-csv-datasource")
    .config(conf)
    .getOrCreate()
  import spark.implicits._

  def withTempPath(f: File => Unit): Unit = {
    val path = Utils.createTempDir()
    path.delete()
    try f(path) finally Utils.deleteRecursively(path)
  }

  def multiColumnsBenchmark(rowsNum: Int): Unit = {
    val colsNum = 1000
    val benchmark = new Benchmark(s"Wide rows with $colsNum columns", rowsNum)

    withTempPath { path =>
      val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType))
      val schema = StructType(fields)
      val values = (0 until colsNum).map(i => i.toString).mkString(",")
      val columnNames = schema.fieldNames

      spark.range(rowsNum)
        .select(Seq.tabulate(colsNum)(i => lit(i).as(s"col$i")): _*)
        .write.option("header", true)
        .csv(path.getAbsolutePath)

      val ds = spark.read.schema(schema).csv(path.getAbsolutePath)

      benchmark.addCase(s"Select $colsNum columns", 3) { _ =>
        ds.select("*").filter((row: Row) => true).count()
      }
      val cols100 = columnNames.take(100).map(Column(_))
      benchmark.addCase(s"Select 100 columns", 3) { _ =>
        ds.select(cols100: _*).filter((row: Row) => true).count()
      }
      benchmark.addCase(s"Select one column", 3) { _ =>
        ds.select($"col1").filter((row: Row) => true).count()
      }

      /*
      Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz

      Wide rows with 1000 columns:        Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
      ---------------------------------------------------------------------------------------------
      Select 1000 columns                     76910 / 78065          0.0       76909.8       1.0X
      Select 100 columns                      28625 / 32884          0.0       28625.1       2.7X
      Select one column                       22498 / 22669          0.0       22497.8       3.4X
      */
      benchmark.run()
    }
  }

  def main(args: Array[String]): Unit = {
    multiColumnsBenchmark(rowsNum = 1000 * 1000)
  }
}
```
Member:
BTW, I think we are already doing column pruning by avoiding the casting cost, which is relatively expensive compared to the parsing logic.

Member (Author):
You are right, avoiding unnecessary casting speeds things up by more than 2 times. We can see that in this benchmark even before my changes: without the changes, selecting only one string column takes 44.5 seconds, while selecting all columns takes about 80 seconds.

As the benchmark shows, we can achieve performance improvements in the parsing itself too. Selecting only 1 out of 1000 columns takes 22.5 seconds with this PR, while without it the same query takes 44.5 seconds.
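For context, below is a minimal, illustrative sketch of the kind of uniVocity column-selection API that makes this sort of pruning possible in the first place. It is not code from this PR (the actual integration lives in Spark's `UnivocityParser`, and the column indexes here are arbitrary); it only shows that the underlying parser can be told which fields are actually needed.

```scala
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}

// Illustrative sketch: ask uniVocity to materialize only the 1st and 3rd fields.
// Integer.valueOf is used because selectIndexes takes java.lang.Integer varargs.
val settings = new CsvParserSettings()
settings.selectIndexes(Integer.valueOf(0), Integer.valueOf(2))

val parser = new CsvParser(settings)
// With default settings the unselected fields are dropped from the output,
// so this should print something like "1, 3".
val tokens = parser.parseLine("1,2,3,4,5")
println(tokens.mkString(", "))
```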
Changes to the existing `CSVSuite`:

```diff
@@ -267,7 +267,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
       .options(Map("header" -> "true", "mode" -> "dropmalformed"))
       .load(testFile(carsFile))
 
-      assert(cars.select("year").collect().size === 2)
+      assert(cars.collect().size === 2)
 
     }
   }
@@ -1322,4 +1322,31 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     val sampled = spark.read.option("inferSchema", true).option("samplingRatio", 1.0).csv(ds)
     assert(sampled.count() == ds.count())
   }
+
+  test("SPARK-24244: Select a subset of all columns") {
+    withTempPath { path =>
+      import collection.JavaConverters._
+      val schema = new StructType()
+        .add("f1", IntegerType).add("f2", IntegerType).add("f3", IntegerType)
+        .add("f4", IntegerType).add("f5", IntegerType).add("f6", IntegerType)
+        .add("f7", IntegerType).add("f8", IntegerType).add("f9", IntegerType)
+        .add("f10", IntegerType).add("f11", IntegerType).add("f12", IntegerType)
+        .add("f13", IntegerType).add("f14", IntegerType).add("f15", IntegerType)
+
+      val odf = spark.createDataFrame(List(
+        Row(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+        Row(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15)
+      ).asJava, schema)
+      odf.write.csv(path.getCanonicalPath)
+      val idf = spark.read
+        .schema(schema)
+        .csv(path.getCanonicalPath)
+        .select('f15, 'f10, 'f5)
+
+      checkAnswer(
+        idf,
+        List(Row(15, 10, 5), Row(-15, -10, -5))
+      )
+    }
+  }
 }
```
Shall we add some more examples? For instance, I guess df.count() with dropmalformed now gives a different number too.
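A rough sketch of the kind of additional check being suggested follows; the data, schema, and expected counts are illustrative assumptions, not values taken from the PR. The idea is that once only the required columns are parsed, a query that needs no columns at all (such as a plain `count()`) may never touch the malformed field and therefore may not drop the row.

```scala
import java.nio.file.Files
import org.apache.spark.sql.SparkSession

// Assumes a local SparkSession; the temp directory and sample rows are made up.
val spark = SparkSession.builder().master("local[1]").appName("dropmalformed-demo").getOrCreate()
import spark.implicits._

val dir = Files.createTempDirectory("csv-pruning-demo").toString
// In the second row, column `b` cannot be parsed as an integer.
Seq("1,2", "3,oops").toDF().coalesce(1).write.mode("overwrite").text(dir)

val df = spark.read
  .schema("a INT, b INT")
  .option("mode", "dropmalformed")
  .csv(dir)

// A plain count() requires no columns, so with column pruning the malformed
// value in `b` may never be parsed and the row may survive (count could be 2).
println(df.count())

// Forcing `b` to be parsed still drops the malformed row (count should be 1).
println(df.filter("b IS NOT NULL").count())

spark.stop()
```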
Can we follow the style of the other migration guides?