-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-11905] [SQL] Support Persist/Cache and Unpersist in Dataset APIs #9889
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 22 commits
01e4cdf
6835704
9180687
b38a21e
d2b84af
fda8025
ac0dccd
6e0018b
0546772
b37a64f
f061671
88d5e9d
c135e1f
661260b
2517777
aa5dc52
2dfa0fd
c4489ed
683fa6f
1c82396
d929d9b
92ede39
8071d30
b9518ee
b8d287a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -338,6 +338,15 @@ class SQLContext private[sql]( | |
| cacheManager.lookupCachedData(table(tableName)).nonEmpty | ||
| } | ||
|
|
||
| /** | ||
| * Returns true if the [[Queryable]] is currently cached in-memory. | ||
| * | ||
| * The answer is whether `cacheManager.lookupCachedData` yields a non-empty result for `qName`. | ||
| * | ||
| * NOTE(review): `@since 1.3.0` looks carried over from an older overload - confirm the release | ||
| * in which this [[Queryable]] overload first ships. | ||
| * | ||
| * @param qName the [[Queryable]] to look up in the cache manager | ||
| * @return true when the lookup result is non-empty, false otherwise | ||
| * @group cachemgmt | ||
| * @since 1.3.0 | ||
| */ | ||
| def isCached(qName: Queryable): Boolean = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets make this |
||
| cacheManager.lookupCachedData(qName).nonEmpty | ||
| } | ||
|
|
||
| /** | ||
| * Caches the specified table in-memory. | ||
| * @group cachemgmt | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql | ||
|
|
||
| import scala.language.postfixOps | ||
|
|
||
| import org.apache.spark.sql.functions._ | ||
| import org.apache.spark.sql.test.SharedSQLContext | ||
|
|
||
|
|
||
| /** | ||
| * Tests for caching Datasets: `cache()` / `persist()` followed by `unpersist()`, with the | ||
| * cached state checked through `assertCached` and `sqlContext.isCached`. | ||
| */ | ||
| class CacheSuite extends QueryTest with SharedSQLContext { | ||
|
||
| import testImplicits._ | ||
|
|
||
| // Caches a projected Dataset, materializes it, checks the answer, then drops the cache. | ||
| test("persist and unpersist") { | ||
| val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int]) | ||
| val cached = ds.cache() | ||
| // count triggers the caching action. It should not throw. | ||
| cached.count() | ||
| // Make sure the Dataset is indeed cached. | ||
| assertCached(cached) | ||
| // Check result. | ||
| checkAnswer( | ||
| cached, | ||
| 2, 3, 4) | ||
| // Drop the cache. | ||
| cached.unpersist() | ||
| assert(!sqlContext.isCached(cached), "The Dataset should not be cached.") | ||
| } | ||
|
|
||
| // Persists both join inputs (element types String and Int) and checks joinWith still answers. | ||
| test("persist and then rebind right encoder when join 2 datasets") { | ||
| val ds1 = Seq("1", "2").toDS().as("a") | ||
| val ds2 = Seq(2, 3).toDS().as("b") | ||
|
|
||
| ds1.persist() | ||
| assertCached(ds1) | ||
| ds2.persist() | ||
| assertCached(ds2) | ||
|
|
||
| val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") | ||
|
||
| checkAnswer(joined, ("2", 2)) | ||
|
|
||
| ds1.unpersist() | ||
| assert(!sqlContext.isCached(ds1), "The Dataset ds1 should not be cached.") | ||
| ds2.unpersist() | ||
| assert(!sqlContext.isCached(ds2), "The Dataset ds2 should not be cached.") | ||
| } | ||
|
|
||
| // Persists the result of groupBy + mapGroups and filters on top of the cached Dataset. | ||
| test("persist and then groupBy columns asKey, map") { | ||
| val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() | ||
| val grouped = ds.groupBy($"_1").keyAs[String] | ||
| val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) } | ||
| agged.persist() | ||
| // Without this check the `!isCached(agged)` assertion below would pass even if | ||
| // persist() had never registered the Dataset. | ||
| assertCached(agged) | ||
|
|
||
| checkAnswer( | ||
| agged.filter(_._1 == "b"), | ||
|
||
| ("b", 3)) | ||
|
|
||
| // `ds` itself was never persisted in this test; these two lines only check that | ||
| // unpersisting it leaves it reported as not cached. | ||
| ds.unpersist() | ||
| assert(!sqlContext.isCached(ds), "The Dataset ds should not be cached.") | ||
| agged.unpersist() | ||
| assert(!sqlContext.isCached(agged), "The Dataset agged should not be cached.") | ||
| } | ||
|
|
||
| // NOTE(review): ignored - presumably two separate `ds.map(f)` calls are not yet recognised | ||
| // as the same cached plan; confirm the reason before enabling this test. | ||
| ignore("persist and then map/filter with lambda functions") { | ||
|
||
| val f = (i: Int) => i + 1 | ||
|
|
||
| val ds = Seq(1, 2, 3).toDS() | ||
| val mapped = ds.map(f) | ||
| mapped.cache() | ||
|
|
||
| val mapped2 = ds.map(f) | ||
| assertCached(mapped2) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment style here is off and we should actually have a description. Could we just move the functions/docs from DataFrame to Queryable?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So far, we are unable to move the functions to
`Queryable` because the types of the returned values are different. I just added the descriptions in both `DataFrame` and `Dataset`. Hopefully, this resolves your concern. Thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@marmbrus moving functions into Queryable actually breaks both scaladoc and javadoc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rxin I think that's only because we explicitly exclude execution from scaladoc. Maybe we should move Queryable, or stop excluding that class? I don't want to duplicate a ton of docs.