@@ -190,9 +190,9 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits
190190
191191 df1.unpersist(blocking = true )
192192
193- // df1 un-cached; df2's cache plan re-compiled
193+ // df1 un-cached; df2's cache plan stays the same
194194 assert(df1.storageLevel == StorageLevel .NONE )
195- assertCacheDependency(df1.groupBy(' a ).agg(sum(' b )), 0 )
195+ assertCacheDependency(df1.groupBy(' a ).agg(sum(' b )))
196196
197197 val df4 = df1.groupBy(' a ).agg(sum(' b )).agg(sum(" sum(b)" ))
198198 assertCached(df4)
@@ -206,4 +206,44 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits
206206 // first time use, load cache
207207 checkDataset(df5, Row (10 ))
208208 }
209+
// Regression test for SPARK-26708: un-persisting a Dataset must leave dependent caches that
// already hold data untouched, while dependent caches that have not been loaded yet are
// re-planned so that they no longer read from the dropped cache.
test("SPARK-26708 Cache data and cached plan should stay consistent") {
  val df = spark.range(0, 5).toDF("a")
  val df1 = df.withColumn("b", 'a + 1)
  val df2 = df.filter('a > 1)

  df.cache()
  // Add df1 to the CacheManager; the buffer is currently empty.
  df1.cache()
  // After calling collect(), df1's buffer has been loaded.
  df1.collect()
  // Add df2 to the CacheManager; the buffer is currently empty.
  df2.cache()

  // Verify that df1 is an InMemoryRelation plan with dependency on another cached plan.
  assertCacheDependency(df1)
  // Remember the physical plan behind df1's cache so it can be compared after df is dropped.
  // A pattern match (instead of a blind cast) reports the unexpected plan if this ever breaks.
  val df1InnerPlan = df1.queryExecution.withCachedData match {
    case relation: InMemoryRelation => relation.cacheBuilder.cachedPlan
    case other => fail(s"Expected df1 to be planned as an InMemoryRelation, but got:\n$other")
  }
  // Verify that df2 is an InMemoryRelation plan with dependency on another cached plan.
  assertCacheDependency(df2)

  df.unpersist(blocking = true)

  // Verify that df1's cache has stayed the same, since df1's cache already has data
  // before df.unpersist().
  val df1Limit = df1.limit(2)
  val df1LimitInnerPlan = df1Limit.queryExecution.withCachedData.collectFirst {
    case relation: InMemoryRelation => relation.cacheBuilder.cachedPlan
  }
  // `contains` is false for None, so this also checks that a cached relation was found.
  assert(df1LimitInnerPlan.contains(df1InnerPlan))

  // Verify that df2's cache has been re-cached, with a new physical plan rid of dependency
  // on df, since df2's cache had not been loaded before df.unpersist().
  val df2Limit = df2.limit(2)
  val df2LimitInnerPlan = df2Limit.queryExecution.withCachedData.collectFirst {
    case relation: InMemoryRelation => relation.cacheBuilder.cachedPlan
  }
  // `exists` is false for None: a cached relation must be present AND its physical plan
  // must not contain any scan over another in-memory cache.
  assert(df2LimitInnerPlan.exists { plan =>
    plan.collectFirst { case scan: InMemoryTableScanExec => scan }.isEmpty
  })
}
209249}
0 commit comments