From ed16a1a019c486551f05ec6f9eedd0a12c99af4f Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Fri, 7 May 2021 18:59:58 -0700
Subject: [PATCH 01/30] [SPARK-35345][SQL] Add BloomFilter Benchmark test for Parquet

---
 .../benchmark/BloomFilterBenchmark.scala      | 55 ++++++++++++++++---
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index f78ccf9569a0f..af0e114eed8c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -19,14 +19,13 @@ package org.apache.spark.sql.execution.benchmark
 
 import scala.util.Random
 
+import org.apache.parquet.hadoop.ParquetOutputFormat
+
 import org.apache.spark.benchmark.Benchmark
 
 /**
  * Benchmark to measure read performance with Bloom filters.
  *
- * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes
- * available.
- *
  * To run this benchmark:
  * {{{
  *   1. without sbt: bin/spark-submit --class
@@ -43,7 +42,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private val N = scaleFactor * 1000 * 1000
   private val df = spark.range(N).map(_ => Random.nextInt)
 
-  private def writeBenchmark(): Unit = {
+  private def writeORCBenchmark(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
@@ -61,7 +60,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     }
   }
 
-  private def readBenchmark(): Unit = {
+  private def readORCBenchmark(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
@@ -81,8 +80,50 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     }
   }
 
+  private def writeParquetBenchmark(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      runBenchmark(s"Parquet Write") {
+        val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          df.write.mode("overwrite").parquet(path + "/withoutBF")
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          df.write.mode("overwrite")
+            .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+            .parquet(path + "/withBF")
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  private def readParquetBenchmark(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      df.write.parquet(path + "/withoutBF")
+      df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+        .parquet(path + "/withBF")
+
+      runBenchmark(s"Parquet Read") {
+        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          spark.read.parquet(path + "/withoutBF").where("value = 0").noop()
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          spark.read.parquet(path + "/withBF").where("value = 0").noop()
+        }
+        benchmark.run()
+      }
+    }
+  }
+
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
-    writeBenchmark()
-    readBenchmark()
+    writeORCBenchmark()
+    readORCBenchmark()
+    writeParquetBenchmark()
+    readParquetBenchmark()
   }
 }

From fa02810942ef932eda6bc22723d90c5fac7214de Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Fri, 7 May 2021 19:44:33 -0700
Subject: [PATCH 02/30] enable ParquetInputFormat bloom filtering on read

---
 .../sql/execution/benchmark/BloomFilterBenchmark.scala | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index af0e114eed8c1..9dcf48c37ff2d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -18,9 +18,7 @@ package org.apache.spark.sql.execution.benchmark
 
 import scala.util.Random
-
-import org.apache.parquet.hadoop.ParquetOutputFormat
-
+import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat}
 import org.apache.spark.benchmark.Benchmark
 
 /**
@@ -113,7 +111,8 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
           spark.read.parquet(path + "/withoutBF").where("value = 0").noop()
         }
         benchmark.addCase("With bloom filter") { _ =>
-          spark.read.parquet(path + "/withBF").where("value = 0").noop()
+          spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true)
+            .parquet(path + "/withBF").where("value = 0").noop()
         }
         benchmark.run()
       }

From 1bb6675f6e09e9ca844ec021e310316584bdf876 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Fri, 7 May 2021 19:50:35 -0700
Subject: [PATCH 03/30] fix lint scala

---
 .../spark/sql/execution/benchmark/BloomFilterBenchmark.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index 9dcf48c37ff2d..cf2f621390526 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -18,7 +18,9 @@ package org.apache.spark.sql.execution.benchmark
 
 import scala.util.Random
+
 import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat}
+
 import org.apache.spark.benchmark.Benchmark
 
 /**

From 2a3f5eb1e6a698b3627e5692211319fba8762582 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Sat, 8 May 2021 09:03:46 -0700
Subject: [PATCH 04/30] update benchmark test result

---
 .../BloomFilterBenchmark-jdk11-results.txt    | 38 +++++++++++++++----
 .../BloomFilterBenchmark-results.txt          | 38 +++++++++++++++----
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
index 1e3abebef021f..0b2de11499eda 100644
--- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
@@ -2,23 +2,45 @@ ORC Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
-Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                              19503          19621         166          5.1         195.0       1.0X
-With bloom filter                                 22472          22710         335          4.4         224.7       0.9X
+Without bloom filter                              19315          19365          70          5.2         193.2       1.0X
+With bloom filter                                 21847          22218         524          4.6         218.5       0.9X
 
 
 ================================================================================================
 ORC Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure
-Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Read a row from 100M rows:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                               1981           2040          82         50.5          19.8       1.0X
-With bloom filter                                  1428           1467          54         70.0          14.3       1.4X
+Without bloom filter                               1930           1942          17         51.8          19.3       1.0X
+With bloom filter                                  1218           1316         139         82.1          12.2       1.6X
+
+
+================================================================================================
+Parquet Write
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter                              19394          19418          33          5.2         193.9       1.0X
+With bloom filter                                 24675          24758         116          4.1         246.8       0.8X
+
+
+================================================================================================
+Parquet Read
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Read a row from 100M rows:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter                               1201           1230          41         83.3          12.0       1.0X
+With bloom filter                                  1262           1301          54         79.2          12.6       1.0X
diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
index 149d3bf76f770..68ca0909c1e32 100644
--- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt
+++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
@@ -2,23 +2,45 @@ ORC Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                              14922          15194         384          6.7         149.2       1.0X
-With bloom filter                                 17270          17665         559          5.8         172.7       0.9X
+Without bloom filter                              19501          19669         238          5.1         195.0       1.0X
+With bloom filter                                 22396          22473         109          4.5         224.0       0.9X
 
 
 ================================================================================================
 ORC Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                               1463           1486          32         68.4          14.6       1.0X
-With bloom filter                                  1232           1239           9         81.2          12.3       1.2X
+Without bloom filter                               1652           1657           7         60.5          16.5       1.0X
+With bloom filter                                  1197           1221          34         83.5          12.0       1.4X
+
+
+================================================================================================
+Parquet Write
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter                              17395          17441          64          5.7         174.0       1.0X
+With bloom filter                                 26733          26838         149          3.7         267.3       0.7X
+
+
+================================================================================================
+Parquet Read
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+Read a row from 100M rows:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter                                981            994          15        102.0           9.8       1.0X
+With bloom filter                                   955            979          28        104.7           9.6       1.0X

From 10d7a977391d659d2060ba596c55d0334754866c Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Sun, 9 May 2021 13:17:25 -0700
Subject: [PATCH 05/30] set parquet block size

---
 .../benchmark/BloomFilterBenchmark.scala | 38 +++++++++++--------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index cf2f621390526..efca231a91e5c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -100,23 +100,29 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   }
 
   private def readParquetBenchmark(): Unit = {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-
-      df.write.parquet(path + "/withoutBF")
-      df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
-        .parquet(path + "/withBF")
-
-      runBenchmark(s"Parquet Read") {
-        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
-        benchmark.addCase("Without bloom filter") { _ =>
-          spark.read.parquet(path + "/withoutBF").where("value = 0").noop()
-        }
-        benchmark.addCase("With bloom filter") { _ =>
-          spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true)
-            .parquet(path + "/withBF").where("value = 0").noop()
+    val blockSizes = Seq(512 * 1024, 1024 * 1024, 2 * 1024 * 1024, 3 * 1024 * 1024,
+      4 * 1024 * 1024, 5 * 1024 * 1024, 6 * 1024 * 1024, 7 * 1024 * 1024,
+      8 * 1024 * 1024, 9 * 1024 * 1024, 10 * 1024 * 1024)
+    for (blocksize <- blockSizes) {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+
+        df.write.parquet(path + "/withoutBF")
+        df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true)
+          .option("parquet.block.size", blocksize)
+          .parquet(path + "/withBF")
+
+        runBenchmark(s"Parquet Read") {
+          val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+          benchmark.addCase("Without bloom filter, blocksize: " + blocksize) { _ =>
+            spark.read.parquet(path + "/withoutBF").where("value = 0").noop()
+          }
+          benchmark.addCase("With bloom filter, blocksize: " + blocksize) { _ =>
+            spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true)
+              .parquet(path + "/withBF").where("value = 0").noop()
+          }
+          benchmark.run()
         }
-        benchmark.run()
       }
     }
   }

From c8375d6eb0a24828cd9d1c2b7c09459656121867 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Sun, 9 May 2021 14:37:02 -0700
Subject: [PATCH 06/30] update benchmark results

---
 .../BloomFilterBenchmark-jdk11-results.txt | 144 ++++++++++++++++--
 .../BloomFilterBenchmark-results.txt       | 144 ++++++++++++++++--
 2 files changed, 264 insertions(+), 24 deletions(-)

diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
index 0b2de11499eda..733e0ecb9164e 100644
--- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
@@ -3,11 +3,11 @@ ORC Write
 ================================================================================================
 
 OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                              19315          19365          70          5.2         193.2       1.0X
-With bloom filter                                 21847          22218         524          4.6         218.5       0.9X
+Without bloom filter                              18388          18445          81          5.4         183.9       1.0X
+With bloom filter                                 21555          21617          87          4.6         215.6       0.9X
 
 
 ================================================================================================
@@ -15,11 +15,11 @@ ORC Read
 ================================================================================================
 
 OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                               1930           1942          17         51.8          19.3       1.0X
-With bloom filter                                  1218           1316         139         82.1          12.2       1.6X
+Without bloom filter                               1841           1898          81         54.3          18.4       1.0X
+With bloom filter                                  1558           1696         194         64.2          15.6       1.2X
 
 
 ================================================================================================
@@ -27,11 +27,11 @@ Parquet Write
 ================================================================================================
 
 OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Write 100M rows:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter                              19394          19418          33          5.2         193.9       1.0X
-With bloom filter                                 24675          24758         116          4.1         246.8       0.8X
+Without bloom filter                              18525          18692         236          5.4         185.2       1.0X
+With bloom filter                                 25506          25558          73          3.9         255.1       0.7X
================================================================================================ @@ -39,8 +39,128 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1201 1230 41 83.3 12.0 1.0X -With bloom filter 1262 1301 54 79.2 12.6 1.0X +Without bloom filter, blocksize: 524288 1214 1328 161 82.4 12.1 1.0X +With bloom filter, blocksize: 524288 1192 1358 235 83.9 11.9 1.0X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 1048576 1114 1146 45 89.8 11.1 1.0X +With bloom filter, blocksize: 1048576 634 705 123 157.8 6.3 1.8X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 1071 1099 40 93.4 10.7 1.0X +With bloom filter, blocksize: 2097152 468 484 10 213.6 4.7 2.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 3145728 1081 1103 32 92.5 10.8 1.0X +With bloom filter, blocksize: 3145728 332 369 44 301.6 3.3 3.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 1057 1070 20 94.7 10.6 1.0X +With bloom filter, blocksize: 4194304 295 342 31 339.3 2.9 3.6X + + 
+================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 5242880 1006 1049 60 99.4 10.1 1.0X +With bloom filter, blocksize: 5242880 237 278 22 422.5 2.4 4.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 988 1040 74 101.2 9.9 1.0X +With bloom filter, blocksize: 6291456 268 305 27 373.6 2.7 3.7X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 7340032 1004 1031 38 99.6 10.0 1.0X +With bloom filter, blocksize: 7340032 476 547 60 210.2 4.8 2.1X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 1452 1469 25 68.9 14.5 1.0X +With bloom filter, blocksize: 8388608 658 677 17 152.0 6.6 2.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 9437184 1456 1477 30 68.7 14.6 1.0X +With bloom filter, blocksize: 9437184 779 825 40 128.4 7.8 1.9X + + +================================================================================================ +Parquet Read 
+================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 10485760 1472 1474 3 68.0 14.7 1.0X +With bloom filter, blocksize: 10485760 777 830 65 128.7 7.8 1.9X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 68ca0909c1e32..775e5c5b80f4b 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -3,11 +3,11 @@ ORC Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 19501 19669 238 5.1 195.0 1.0X -With bloom filter 22396 22473 109 4.5 224.0 0.9X +Without bloom filter 16562 16649 123 6.0 165.6 1.0X +With bloom filter 19251 19456 290 5.2 192.5 0.9X ================================================================================================ @@ -15,11 +15,11 @@ ORC Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1652 1657 7 60.5 16.5 1.0X -With bloom filter 1197 1221 34 83.5 12.0 1.4X +Without bloom filter 1522 1532 15 65.7 15.2 1.0X +With bloom filter 1207 1219 16 82.8 12.1 1.3X ================================================================================================ @@ -27,11 +27,11 @@ Parquet Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 17395 17441 64 5.7 174.0 1.0X -With bloom filter 26733 26838 149 3.7 267.3 0.7X +Without bloom filter 15015 15028 18 6.7 150.2 1.0X +With bloom filter 27149 27223 106 3.7 271.5 0.6X ================================================================================================ @@ -39,8 +39,128 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 981 994 15 102.0 9.8 1.0X -With bloom filter 955 979 28 104.7 9.6 1.0X +Without bloom filter, blocksize: 524288 1035 1039 6 96.7 10.3 1.0X +With bloom filter, blocksize: 524288 848 878 26 117.9 8.5 1.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 1048576 998 1012 21 100.2 10.0 1.0X +With bloom filter, blocksize: 1048576 459 484 29 218.1 4.6 2.2X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 2097152 948 961 18 105.5 9.5 1.0X +With bloom filter, blocksize: 2097152 275 288 13 363.3 2.8 3.4X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 3145728 953 964 11 104.9 9.5 1.0X +With bloom filter, blocksize: 3145728 223 236 16 448.3 2.2 4.3X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 4194304 945 966 18 105.8 9.5 1.0X +With bloom filter, blocksize: 4194304 195 209 15 511.8 2.0 4.8X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 5242880 984 996 13 101.7 9.8 1.0X +With bloom filter, blocksize: 5242880 214 224 12 468.1 2.1 4.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 6291456 978 996 19 102.3 9.8 1.0X +With bloom filter, blocksize: 6291456 281 289 10 355.6 2.8 3.5X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 7340032 989 1008 19 101.1 9.9 1.0X +With bloom filter, blocksize: 7340032 337 345 6 296.8 3.4 2.9X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 8388608 1005 1011 8 99.5 10.0 1.0X +With bloom filter, blocksize: 8388608 385 404 20 259.6 3.9 2.6X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Without bloom filter, blocksize: 9437184 992 1002 14 100.8 9.9 1.0X +With bloom filter, blocksize: 9437184 521 538 16 191.9 5.2 1.9X + + +================================================================================================ +Parquet Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 10485760 969 
998 27 103.2 9.7 1.0X +With bloom filter, blocksize: 10485760 487 510 16 205.3 4.9 2.0X From 34d05113d307395bd1c1449651e09a8285fd0c6e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 9 May 2021 14:54:29 -0700 Subject: [PATCH 07/30] set parquet.block.size for withoutBF --- .../spark/sql/execution/benchmark/BloomFilterBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index efca231a91e5c..f7828ad788144 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -107,7 +107,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - df.write.parquet(path + "/withoutBF") + df.write.option("parquet.block.size", blocksize).parquet(path + "/withoutBF") df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) .option("parquet.block.size", blocksize) .parquet(path + "/withBF") From 21cc2ac907ffe9256942d818663ce225d1a1b992 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 9 May 2021 16:33:10 -0700 Subject: [PATCH 08/30] update results --- .../BloomFilterBenchmark-jdk11-results.txt | 84 +++++++++---------- .../BloomFilterBenchmark-results.txt | 84 +++++++++---------- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt index 733e0ecb9164e..258a28f730eba 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt @@ -3,11 +3,11 @@ ORC Write ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 18388 18445 81 5.4 183.9 1.0X -With bloom filter 21555 21617 87 4.6 215.6 0.9X +Without bloom filter 21931 21987 79 4.6 219.3 1.0X +With bloom filter 24876 25137 369 4.0 248.8 0.9X ================================================================================================ @@ -15,11 +15,11 @@ ORC Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1841 1898 81 54.3 18.4 1.0X -With bloom filter 1558 1696 194 64.2 15.6 1.2X +Without bloom filter 1694 1753 82 59.0 16.9 1.0X +With bloom filter 1346 1355 13 74.3 13.5 1.3X ================================================================================================ @@ -27,11 +27,11 @@ Parquet Write 
================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 18525 18692 236 5.4 185.2 1.0X -With bloom filter 25506 25558 73 3.9 255.1 0.7X +Without bloom filter 16252 16780 747 6.2 162.5 1.0X +With bloom filter 24145 24686 765 4.1 241.5 0.7X ================================================================================================ @@ -39,11 +39,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 524288 1214 1328 161 82.4 12.1 1.0X -With bloom filter, blocksize: 524288 1192 1358 235 83.9 11.9 1.0X +Without bloom filter, blocksize: 524288 1252 1399 208 79.9 12.5 1.0X +With bloom filter, blocksize: 524288 1036 1203 237 96.5 10.4 1.2X ================================================================================================ @@ -51,11 +51,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 1048576 1114 1146 45 89.8 11.1 1.0X -With bloom filter, blocksize: 1048576 634 705 123 157.8 6.3 1.8X +Without bloom filter, blocksize: 1048576 1346 1413 94 74.3 13.5 1.0X +With bloom filter, blocksize: 1048576 686 786 149 145.8 6.9 2.0X ================================================================================================ @@ -63,11 +63,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 1071 1099 40 93.4 10.7 1.0X -With bloom filter, blocksize: 2097152 468 484 10 213.6 4.7 2.3X +Without bloom filter, blocksize: 2097152 1121 1129 12 89.2 11.2 1.0X +With bloom filter, blocksize: 2097152 385 417 26 260.0 3.8 2.9X ================================================================================================ @@ -75,11 +75,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure 
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 3145728 1081 1103 32 92.5 10.8 1.0X -With bloom filter, blocksize: 3145728 332 369 44 301.6 3.3 3.3X +Without bloom filter, blocksize: 3145728 1548 1559 16 64.6 15.5 1.0X +With bloom filter, blocksize: 3145728 323 347 35 309.4 3.2 4.8X ================================================================================================ @@ -87,11 +87,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 1057 1070 20 94.7 10.6 1.0X -With bloom filter, blocksize: 4194304 295 342 31 339.3 2.9 3.6X +Without bloom filter, blocksize: 4194304 1337 1366 41 74.8 13.4 1.0X +With bloom filter, blocksize: 4194304 318 345 24 314.4 3.2 4.2X ================================================================================================ @@ -99,11 +99,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 5242880 1006 1049 60 99.4 10.1 1.0X -With bloom filter, blocksize: 5242880 237 278 22 422.5 2.4 4.3X +Without bloom filter, blocksize: 5242880 1311 1374 90 76.3 13.1 1.0X +With bloom filter, blocksize: 5242880 380 410 21 263.2 3.8 3.5X ================================================================================================ @@ -111,11 +111,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 988 1040 74 101.2 9.9 1.0X -With bloom filter, blocksize: 6291456 268 305 27 373.6 2.7 3.7X +Without bloom filter, blocksize: 6291456 1292 1332 56 77.4 12.9 1.0X +With bloom filter, blocksize: 6291456 338 407 70 295.8 3.4 3.8X ================================================================================================ @@ -123,11 +123,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a 
row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 7340032 1004 1031 38 99.6 10.0 1.0X -With bloom filter, blocksize: 7340032 476 547 60 210.2 4.8 2.1X +Without bloom filter, blocksize: 7340032 1353 1381 40 73.9 13.5 1.0X +With bloom filter, blocksize: 7340032 336 400 43 297.5 3.4 4.0X ================================================================================================ @@ -135,11 +135,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 1452 1469 25 68.9 14.5 1.0X -With bloom filter, blocksize: 8388608 658 677 17 152.0 6.6 2.2X +Without bloom filter, blocksize: 8388608 1456 1468 18 68.7 14.6 1.0X +With bloom filter, blocksize: 8388608 662 697 38 150.9 6.6 2.2X ================================================================================================ @@ -147,11 +147,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 9437184 1456 1477 30 68.7 14.6 1.0X -With bloom filter, blocksize: 9437184 779 825 40 128.4 7.8 1.9X +Without bloom filter, blocksize: 9437184 1538 1585 66 65.0 15.4 1.0X +With bloom filter, blocksize: 9437184 653 710 72 153.2 6.5 2.4X ================================================================================================ @@ -159,8 +159,8 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 10485760 1472 1474 3 68.0 14.7 1.0X -With bloom filter, blocksize: 10485760 777 830 65 128.7 7.8 1.9X +Without bloom filter, blocksize: 10485760 1518 1524 9 65.9 15.2 1.0X +With bloom filter, blocksize: 10485760 610 715 93 164.0 6.1 2.5X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 775e5c5b80f4b..cdf6f8d7ea217 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -3,11 +3,11 @@ ORC Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure 
-Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 16562 16649 123 6.0 165.6 1.0X -With bloom filter 19251 19456 290 5.2 192.5 0.9X +Without bloom filter 18561 18648 123 5.4 185.6 1.0X +With bloom filter 21399 21410 16 4.7 214.0 0.9X ================================================================================================ @@ -15,11 +15,11 @@ ORC Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1522 1532 15 65.7 15.2 1.0X -With bloom filter 1207 1219 16 82.8 12.1 1.3X +Without bloom filter 1581 1582 2 63.3 15.8 1.0X +With bloom filter 1134 1149 21 88.2 11.3 1.4X ================================================================================================ @@ -27,11 +27,11 @@ Parquet Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 15015 15028 18 6.7 150.2 1.0X -With bloom filter 27149 27223 106 3.7 271.5 0.6X +Without bloom filter 14265 14348 117 7.0 142.6 1.0X +With bloom filter 28544 28717 244 3.5 285.4 0.5X ================================================================================================ @@ -39,11 +39,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 524288 1035 1039 6 96.7 10.3 1.0X -With bloom filter, blocksize: 524288 848 878 26 117.9 8.5 1.2X +Without bloom filter, blocksize: 524288 985 1009 34 101.5 9.9 1.0X +With bloom filter, blocksize: 524288 765 780 13 130.7 7.6 1.3X ================================================================================================ @@ -51,11 +51,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without 
bloom filter, blocksize: 1048576 998 1012 21 100.2 10.0 1.0X -With bloom filter, blocksize: 1048576 459 484 29 218.1 4.6 2.2X +Without bloom filter, blocksize: 1048576 898 900 2 111.4 9.0 1.0X +With bloom filter, blocksize: 1048576 416 431 11 240.6 4.2 2.2X ================================================================================================ @@ -63,11 +63,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 948 961 18 105.5 9.5 1.0X -With bloom filter, blocksize: 2097152 275 288 13 363.3 2.8 3.4X +Without bloom filter, blocksize: 2097152 847 854 9 118.1 8.5 1.0X +With bloom filter, blocksize: 2097152 250 263 8 399.3 2.5 3.4X ================================================================================================ @@ -75,11 +75,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 3145728 953 964 11 104.9 9.5 1.0X -With bloom filter, blocksize: 3145728 223 236 16 448.3 2.2 4.3X +Without bloom filter, blocksize: 3145728 857 878 18 116.7 8.6 1.0X +With bloom filter, blocksize: 3145728 194 210 14 515.1 1.9 4.4X ================================================================================================ @@ -87,11 +87,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 945 966 18 105.8 9.5 1.0X -With bloom filter, blocksize: 4194304 195 209 15 511.8 2.0 4.8X +Without bloom filter, blocksize: 4194304 875 877 2 114.3 8.8 1.0X +With bloom filter, blocksize: 4194304 181 197 12 552.6 1.8 4.8X ================================================================================================ @@ -99,11 +99,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 5242880 984 996 13 101.7 9.8 1.0X -With bloom filter, blocksize: 5242880 
214 224 12 468.1 2.1 4.6X +Without bloom filter, blocksize: 5242880 867 871 4 115.4 8.7 1.0X +With bloom filter, blocksize: 5242880 229 244 13 436.5 2.3 3.8X ================================================================================================ @@ -111,11 +111,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 978 996 19 102.3 9.8 1.0X -With bloom filter, blocksize: 6291456 281 289 10 355.6 2.8 3.5X +Without bloom filter, blocksize: 6291456 865 876 11 115.7 8.6 1.0X +With bloom filter, blocksize: 6291456 225 235 8 444.5 2.2 3.8X ================================================================================================ @@ -123,11 +123,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 7340032 989 1008 19 101.1 9.9 1.0X -With bloom filter, blocksize: 7340032 337 345 6 296.8 3.4 2.9X +Without bloom filter, blocksize: 7340032 853 877 20 117.2 8.5 1.0X +With bloom filter, blocksize: 7340032 353 365 9 283.5 3.5 2.4X ================================================================================================ @@ -135,11 +135,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 1005 1011 8 99.5 10.0 1.0X -With bloom filter, blocksize: 8388608 385 404 20 259.6 3.9 2.6X +Without bloom filter, blocksize: 8388608 880 888 8 113.6 8.8 1.0X +With bloom filter, blocksize: 8388608 331 339 7 302.5 3.3 2.7X ================================================================================================ @@ -147,11 +147,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 9437184 992 1002 14 100.8 9.9 1.0X -With bloom filter, blocksize: 9437184 521 538 16 191.9 5.2 1.9X +Without bloom filter, blocksize: 9437184 868 869 1 115.2 8.7 1.0X 
+With bloom filter, blocksize: 9437184 515 520 7 194.1 5.2 1.7X ================================================================================================ @@ -159,8 +159,8 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 10485760 969 998 27 103.2 9.7 1.0X -With bloom filter, blocksize: 10485760 487 510 16 205.3 4.9 2.0X +Without bloom filter, blocksize: 10485760 849 874 23 117.8 8.5 1.0X +With bloom filter, blocksize: 10485760 483 493 9 207.1 4.8 1.8X From 47f70b7fa00512e9c971c74ba3714062c7fb216e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 11 May 2021 17:11:17 -0700 Subject: [PATCH 09/30] adding measurements for 16, 64 and 128M --- .../sql/execution/benchmark/BloomFilterBenchmark.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index f7828ad788144..2c177d07aa800 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -100,9 +100,9 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } private def readParquetBenchmark(): Unit = { - val blockSizes = Seq(512 * 1024, 1024 * 1024, 2 * 1024 * 1024, 3 * 1024 * 1024, - 4 * 1024 * 1024, 5 * 1024 * 1024, 6 * 1024 * 1024, 7 * 1024 * 1024, - 8 * 1024 * 1024, 9 * 1024 * 1024, 10 * 1024 * 1024) + val blockSizes = Seq(2 * 1024 * 1024, 3 * 1024 * 1024, 4 * 1024 * 1024, 5 * 1024 * 1024, + 6 * 1024 * 1024, 8 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024, + 128 * 1024 * 1024) for (blocksize <- blockSizes) { withTempPath { dir => val path = dir.getCanonicalPath From 05b20af9443b653841a6497eb52aef40fb6058e3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 11 May 2021 18:02:42 -0700 Subject: [PATCH 10/30] update test result --- .../BloomFilterBenchmark-jdk11-results.txt | 104 ++++++++---------- .../BloomFilterBenchmark-results.txt | 104 ++++++++---------- 2 files changed, 94 insertions(+), 114 deletions(-) diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt index 258a28f730eba..04dcc36033b3c 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt @@ -3,11 +3,11 @@ ORC Write ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 21931 21987 79 4.6 219.3 1.0X -With bloom filter 24876 25137 369 4.0 248.8 0.9X +Without bloom 
filter 20066 20165 139 5.0 200.7 1.0X +With bloom filter 22784 22937 217 4.4 227.8 0.9X ================================================================================================ @@ -15,11 +15,11 @@ ORC Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1694 1753 82 59.0 16.9 1.0X -With bloom filter 1346 1355 13 74.3 13.5 1.3X +Without bloom filter 1732 1777 64 57.7 17.3 1.0X +With bloom filter 1287 1295 10 77.7 12.9 1.3X ================================================================================================ @@ -27,11 +27,11 @@ Parquet Write ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 16252 16780 747 6.2 162.5 1.0X -With bloom filter 24145 24686 765 4.1 241.5 0.7X +Without bloom filter 18804 18870 93 5.3 188.0 1.0X +With bloom filter 23439 23667 323 4.3 234.4 0.8X ================================================================================================ @@ -39,11 +39,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 524288 1252 1399 208 79.9 12.5 1.0X -With bloom filter, blocksize: 524288 1036 1203 237 96.5 10.4 1.2X +Without bloom filter, blocksize: 2097152 1550 1733 259 64.5 15.5 1.0X +With bloom filter, blocksize: 2097152 350 500 119 285.3 3.5 4.4X ================================================================================================ @@ -51,11 +51,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 1048576 1346 1413 94 74.3 13.5 1.0X -With bloom filter, blocksize: 1048576 686 786 149 145.8 6.9 2.0X +Without bloom filter, blocksize: 3145728 1256 1339 117 79.6 12.6 1.0X +With bloom filter, blocksize: 3145728 333 388 35 300.0 3.3 3.8X ================================================================================================ @@ -63,11 +63,11 @@ Parquet Read 
================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 1121 1129 12 89.2 11.2 1.0X -With bloom filter, blocksize: 2097152 385 417 26 260.0 3.8 2.9X +Without bloom filter, blocksize: 4194304 1240 1259 28 80.7 12.4 1.0X +With bloom filter, blocksize: 4194304 243 297 34 411.4 2.4 5.1X ================================================================================================ @@ -75,11 +75,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 3145728 1548 1559 16 64.6 15.5 1.0X -With bloom filter, blocksize: 3145728 323 347 35 309.4 3.2 4.8X +Without bloom filter, blocksize: 5242880 1345 1380 49 74.3 13.5 1.0X +With bloom filter, blocksize: 5242880 352 433 55 284.1 3.5 3.8X ================================================================================================ @@ -87,11 +87,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 1337 1366 41 74.8 13.4 1.0X -With bloom filter, blocksize: 4194304 318 345 24 314.4 3.2 4.2X +Without bloom filter, blocksize: 6291456 1272 1310 54 78.6 12.7 1.0X +With bloom filter, blocksize: 6291456 356 404 30 280.9 3.6 3.6X ================================================================================================ @@ -99,11 +99,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 5242880 1311 1374 90 76.3 13.1 1.0X -With bloom filter, blocksize: 5242880 380 410 21 263.2 3.8 3.5X +Without bloom filter, blocksize: 8388608 1355 1410 78 73.8 13.5 1.0X +With bloom filter, blocksize: 8388608 603 632 25 165.8 6.0 2.2X ================================================================================================ @@ -111,11 +111,11 @@ Parquet Read ================================================================================================ 
OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 6291456 1292 1332 56 77.4 12.9 1.0X -With bloom filter, blocksize: 6291456 338 407 70 295.8 3.4 3.8X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 1315 1404 126 76.0 13.2 1.0X +With bloom filter, blocksize: 16777216 1161 1235 104 86.1 11.6 1.1X ================================================================================================ @@ -123,11 +123,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 7340032 1353 1381 40 73.9 13.5 1.0X -With bloom filter, blocksize: 7340032 336 400 43 297.5 3.4 4.0X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 1332 1394 89 75.1 13.3 1.0X +With bloom filter, blocksize: 33554432 1283 1338 77 77.9 12.8 1.0X ================================================================================================ @@ -135,11 +135,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 8388608 1456 1468 18 68.7 14.6 1.0X -With bloom filter, blocksize: 8388608 662 697 38 150.9 6.6 2.2X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 67108864 1323 1325 3 75.6 13.2 1.0X +With bloom filter, blocksize: 67108864 1320 1335 21 75.7 13.2 1.0X ================================================================================================ @@ -147,20 +147,10 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 9437184 1538 1585 66 65.0 15.4 1.0X -With bloom filter, blocksize: 9437184 653 710 72 153.2 6.5 2.4X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 134217728 1188 1221 48 84.2 11.9 1.0X +With bloom filter, blocksize: 134217728 1221 1248 39 81.9 12.2 1.0X -================================================================================================ -Parquet Read -================================================================================================ - -OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 10485760 1518 1524 9 65.9 15.2 1.0X -With bloom filter, blocksize: 10485760 610 715 93 164.0 6.1 2.5X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index cdf6f8d7ea217..ba5c699aeaa99 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -3,11 +3,11 @@ ORC Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 18561 18648 123 5.4 185.6 1.0X -With bloom filter 21399 21410 16 4.7 214.0 0.9X +Without bloom filter 19398 19534 192 5.2 194.0 1.0X +With bloom filter 24950 24986 51 4.0 249.5 0.8X ================================================================================================ @@ -15,11 +15,11 @@ ORC Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 1581 1582 2 63.3 15.8 1.0X -With bloom filter 1134 1149 21 88.2 11.3 1.4X +Without bloom filter 1554 1604 71 64.4 15.5 1.0X +With bloom filter 1218 1240 32 82.1 12.2 1.3X ================================================================================================ @@ -27,11 +27,11 @@ Parquet Write ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 14265 14348 117 7.0 142.6 1.0X -With bloom filter 28544 28717 244 3.5 285.4 0.5X +Without bloom filter 17246 17265 27 5.8 172.5 1.0X +With bloom filter 26679 26680 1 3.7 266.8 0.6X ================================================================================================ @@ -39,11 +39,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 524288 985 1009 34 101.5 9.9 1.0X -With bloom filter, blocksize: 524288 765 780 13 130.7 7.6 1.3X +Without bloom filter, blocksize: 2097152 956 975 18 104.7 9.6 1.0X +With bloom filter, blocksize: 2097152 281 301 13 355.5 2.8 3.4X ================================================================================================ @@ -51,11 +51,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 1048576 898 900 2 111.4 9.0 1.0X -With bloom filter, blocksize: 1048576 416 431 11 240.6 4.2 2.2X +Without bloom filter, blocksize: 3145728 844 858 13 118.5 8.4 1.0X +With bloom filter, blocksize: 3145728 216 224 8 463.9 2.2 3.9X ================================================================================================ @@ -63,11 +63,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 847 854 9 118.1 8.5 1.0X -With bloom filter, blocksize: 2097152 250 263 8 399.3 2.5 3.4X +Without bloom filter, blocksize: 4194304 845 860 14 118.3 8.4 1.0X +With bloom filter, blocksize: 4194304 210 223 15 476.1 2.1 4.0X ================================================================================================ @@ -75,11 +75,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 
3145728 857 878 18 116.7 8.6 1.0X -With bloom filter, blocksize: 3145728 194 210 14 515.1 1.9 4.4X +Without bloom filter, blocksize: 5242880 837 845 8 119.5 8.4 1.0X +With bloom filter, blocksize: 5242880 228 236 14 438.9 2.3 3.7X ================================================================================================ @@ -87,11 +87,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 875 877 2 114.3 8.8 1.0X -With bloom filter, blocksize: 4194304 181 197 12 552.6 1.8 4.8X +Without bloom filter, blocksize: 6291456 846 853 12 118.3 8.5 1.0X +With bloom filter, blocksize: 6291456 273 284 11 365.7 2.7 3.1X ================================================================================================ @@ -99,11 +99,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 5242880 867 871 4 115.4 8.7 1.0X -With bloom filter, blocksize: 5242880 229 244 13 436.5 2.3 3.8X +Without bloom filter, blocksize: 8388608 843 852 8 118.7 8.4 1.0X +With bloom filter, blocksize: 8388608 376 397 18 266.0 3.8 2.2X ================================================================================================ @@ -111,11 +111,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 6291456 865 876 11 115.7 8.6 1.0X -With bloom filter, blocksize: 6291456 225 235 8 444.5 2.2 3.8X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 16777216 881 886 4 113.4 8.8 1.0X +With bloom filter, blocksize: 16777216 759 760 1 131.7 7.6 1.2X ================================================================================================ @@ -123,11 +123,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 7340032 853 877 20 117.2 8.5 1.0X -With bloom filter, blocksize: 7340032 353 365 9 283.5 3.5 2.4X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 33554432 861 870 8 116.1 8.6 1.0X +With bloom filter, blocksize: 33554432 886 901 13 112.9 8.9 1.0X ================================================================================================ @@ -135,11 +135,11 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 8388608 880 888 8 113.6 8.8 1.0X -With bloom filter, blocksize: 8388608 331 339 7 302.5 3.3 2.7X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 67108864 866 872 8 115.4 8.7 1.0X +With bloom filter, blocksize: 67108864 867 871 4 115.3 8.7 1.0X ================================================================================================ @@ -147,20 +147,10 @@ Parquet Read ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 9437184 868 869 1 115.2 8.7 1.0X -With bloom filter, blocksize: 9437184 515 520 7 194.1 5.2 1.7X +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Without bloom filter, blocksize: 134217728 866 890 20 115.4 8.7 1.0X +With bloom filter, blocksize: 134217728 867 878 12 115.3 8.7 1.0X -================================================================================================ -Parquet Read -================================================================================================ - -OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 10485760 849 874 23 117.8 8.5 1.0X -With bloom filter, blocksize: 10485760 483 493 9 207.1 4.8 1.8X From df044b3d76e5be496d86ad0f47e91cf6bb20becb Mon 
Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 26 May 2021 23:36:03 -0700 Subject: [PATCH 11/30] add benchmark test for IN set with 3000 predicates --- .../benchmark/BloomFilterBenchmark.scala | 79 ++++++++++++++++--- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 2c177d07aa800..71e0509161d8f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.execution.benchmark +import java.util.UUID + import scala.util.Random import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetOutputFormat} import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.functions.col /** * Benchmark to measure read performance with Bloom filters. @@ -40,7 +43,9 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { private val scaleFactor = 100 private val N = scaleFactor * 1000 * 1000 - private val df = spark.range(N).map(_ => Random.nextInt) + private val df1 = spark.range(N).map(_ => Random.nextInt) + + private val df2 = Seq.fill(N) {UUID.randomUUID().toString.replace("-", "")}.toDF private def writeORCBenchmark(): Unit = { withTempPath { dir => @@ -49,10 +54,10 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { runBenchmark(s"ORC Write") { val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => - df.write.mode("overwrite").orc(path + "/withoutBF") + df1.write.mode("overwrite").orc(path + "/withoutBF") } benchmark.addCase("With bloom filter") { _ => - df.write.mode("overwrite") + df1.write.mode("overwrite") .option("orc.bloom.filter.columns", "value").orc(path + "/withBF") } benchmark.run() @@ -64,8 +69,8 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - df.write.orc(path + "/withoutBF") - df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + df1.write.orc(path + "/withoutBF") + df1.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") runBenchmark(s"ORC Read") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) @@ -80,6 +85,29 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } + private def readORCBenchmarkForInSet(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + val samples = df2.sample(0.00003, 128).select("value").as[String].collect() + val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" + + df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF") + df2.repartition(col("value")).sort(col("value")) + .write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + + runBenchmark(s"ORC Read for IN set") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.orc(path + "/withoutBF").where(filter).noop() + } + benchmark.addCase("With bloom filter") { _ => + spark.read.orc(path + "/withBF").where(filter).noop() + } + benchmark.run() + } + } + } + private def writeParquetBenchmark(): Unit = { withTempPath { dir => val path = dir.getCanonicalPath @@ -87,11 +115,12 @@ object 
BloomFilterBenchmark extends SqlBasedBenchmark { runBenchmark(s"Parquet Write") { val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => - df.write.mode("overwrite").parquet(path + "/withoutBF") + df1.write.mode("overwrite").parquet(path + "/withoutBF") } benchmark.addCase("With bloom filter") { _ => - df.write.mode("overwrite") + df1.write.mode("overwrite") .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option("parquet.bloom.filter.expected.ndv#value", "100000000") .parquet(path + "/withBF") } benchmark.run() @@ -107,8 +136,9 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - df.write.option("parquet.block.size", blocksize).parquet(path + "/withoutBF") - df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + df1.write.option("parquet.block.size", blocksize).parquet(path + "/withoutBF") + df1.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option("parquet.bloom.filter.expected.ndv#value", "100000000") .option("parquet.block.size", blocksize) .parquet(path + "/withBF") @@ -127,10 +157,41 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } + private def readParquetBenchmarkForInSet(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + val samples = df2.sample(0.00003, 128).select("value").as[String].collect() + val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" + + df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") + df2.repartition(col("value")).sort(col("value")) + .write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option("parquet.bloom.filter.expected.ndv#value", "100000000") + .parquet(path + "/withBF") + + runBenchmark(s"Parquet Read for IN set") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) + .parquet(path + "/withoutBF").where(filter).noop() + } + benchmark.addCase("With bloom filter") { _ => + spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) + .option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) + .parquet(path + "/withBF").where(filter).noop + } + benchmark.run() + } + } + } + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { writeORCBenchmark() readORCBenchmark() + readORCBenchmarkForInSet() writeParquetBenchmark() readParquetBenchmark() + readParquetBenchmarkForInSet() } } From e582c26661bd2258bdf18355e50e3bc204867402 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 27 May 2021 12:20:23 -0700 Subject: [PATCH 12/30] change the num of predicates in IN set to 300 for now --- .../sql/execution/benchmark/BloomFilterBenchmark.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 71e0509161d8f..62bd93eb4dd45 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { private def readORCBenchmarkForInSet(): Unit = { 
withTempPath { dir => val path = dir.getCanonicalPath - val samples = df2.sample(0.00003, 128).select("value").as[String].collect() + val samples = df2.sample(0.000003, 128).select("value").as[String].collect() val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF") @@ -161,7 +161,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - val samples = df2.sample(0.00003, 128).select("value").as[String].collect() + val samples = df2.sample(0.000003, 128).select("value").as[String].collect() val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") @@ -173,7 +173,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { runBenchmark(s"Parquet Read for IN set") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => - spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) + spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 320) .parquet(path + "/withoutBF").where(filter).noop() } benchmark.addCase("With bloom filter") { _ => From d4d39d3fdecccd3551b07f8249a4015a0420a170 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 27 May 2021 21:15:13 -0700 Subject: [PATCH 13/30] comment out parquet IN set test for now --- .../benchmark/BloomFilterBenchmark.scala | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 62bd93eb4dd45..838ded384d6e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { private def readORCBenchmarkForInSet(): Unit = { withTempPath { dir => val path = dir.getCanonicalPath - val samples = df2.sample(0.000003, 128).select("value").as[String].collect() + val samples = df2.sample(0.00003, 128).select("value").as[String].collect() val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF") @@ -157,34 +157,34 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } - private def readParquetBenchmarkForInSet(): Unit = { - withTempPath { dir => - val path = dir.getCanonicalPath - - val samples = df2.sample(0.000003, 128).select("value").as[String].collect() - val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" - - df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") - df2.repartition(col("value")).sort(col("value")) - .write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) - .option("parquet.bloom.filter.expected.ndv#value", "100000000") - .parquet(path + "/withBF") - - runBenchmark(s"Parquet Read for IN set") { - val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) - benchmark.addCase("Without bloom filter") { _ => - spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 320) - .parquet(path + "/withoutBF").where(filter).noop() - } - 
benchmark.addCase("With bloom filter") { _ => - spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) - .option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) - .parquet(path + "/withBF").where(filter).noop - } - benchmark.run() - } - } - } +// private def readParquetBenchmarkForInSet(): Unit = { +// withTempPath { dir => +// val path = dir.getCanonicalPath +// +// val samples = df2.sample(0.00003, 128).select("value").as[String].collect() +// val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" +// +// df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") +// df2.repartition(col("value")).sort(col("value")) +// .write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) +// .option("parquet.bloom.filter.expected.ndv#value", "100000000") +// .parquet(path + "/withBF") +// +// runBenchmark(s"Parquet Read for IN set") { +// val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) +// benchmark.addCase("Without bloom filter") { _ => +// spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) +// .parquet(path + "/withoutBF").where(filter).noop() +// } +// benchmark.addCase("With bloom filter") { _ => +// spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) +// .option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) +// .parquet(path + "/withBF").where(filter).noop +// } +// benchmark.run() +// } +// } +// } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { writeORCBenchmark() @@ -192,6 +192,5 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { readORCBenchmarkForInSet() writeParquetBenchmark() readParquetBenchmark() - readParquetBenchmarkForInSet() } } From c0e9d97199a88a028133a79d05563b7001255e67 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 28 May 2021 14:55:15 -0700 Subject: [PATCH 14/30] change num of predicate to 30 --- .../benchmark/BloomFilterBenchmark.scala | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 838ded384d6e2..d29c6253ab746 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { private def readORCBenchmarkForInSet(): Unit = { withTempPath { dir => val path = dir.getCanonicalPath - val samples = df2.sample(0.00003, 128).select("value").as[String].collect() + val samples = df2.sample(0.0000003, 128).select("value").as[String].collect() val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF") @@ -157,34 +157,34 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } -// private def readParquetBenchmarkForInSet(): Unit = { -// withTempPath { dir => -// val path = dir.getCanonicalPath -// -// val samples = df2.sample(0.00003, 128).select("value").as[String].collect() -// val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" -// -// df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") -// df2.repartition(col("value")).sort(col("value")) -// 
.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) -// .option("parquet.bloom.filter.expected.ndv#value", "100000000") -// .parquet(path + "/withBF") -// -// runBenchmark(s"Parquet Read for IN set") { -// val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) -// benchmark.addCase("Without bloom filter") { _ => -// spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) -// .parquet(path + "/withoutBF").where(filter).noop() -// } -// benchmark.addCase("With bloom filter") { _ => -// spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) -// .option("spark.sql.parquet.pushdown.inFilterThreshold", 3100) -// .parquet(path + "/withBF").where(filter).noop -// } -// benchmark.run() -// } -// } -// } + private def readParquetBenchmarkForInSet(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + val samples = df2.sample(0.0000003, 128).select("value").as[String].collect() + val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")" + + df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF") + df2.repartition(col("value")).sort(col("value")) + .write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option("parquet.bloom.filter.expected.ndv#value", "100000000") + .parquet(path + "/withBF") + + runBenchmark(s"Parquet Read for IN set") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50) + .parquet(path + "/withoutBF").where(filter).noop() + } + benchmark.addCase("With bloom filter") { _ => + spark.read.option(ParquetInputFormat.BLOOM_FILTERING_ENABLED, true) + .option("spark.sql.parquet.pushdown.inFilterThreshold", 50) + .parquet(path + "/withBF").where(filter).noop + } + benchmark.run() + } + } + } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { writeORCBenchmark() @@ -192,5 +192,6 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { readORCBenchmarkForInSet() writeParquetBenchmark() readParquetBenchmark() + readParquetBenchmarkForInSet() } } From 5fc105f70a50160954b593ac2c1da3010310255e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 28 May 2021 12:39:34 -0700 Subject: [PATCH 15/30] [SPARK-35559][TEST] Speed up one test in AdaptiveQueryExecSuite ### What changes were proposed in this pull request? I just noticed that `AdaptiveQueryExecSuite.SPARK-34091: Batch shuffle fetch in AQE partition coalescing` takes more than 10 minutes to finish, which is unacceptable. This PR sets the shuffle partitions to 10 in that test, so that the test can finish with 5 seconds. ### Why are the changes needed? speed up the test ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? N/A Closes #32695 from cloud-fan/test. 
Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 454d3aa148a44..b9515c038f0ec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1513,7 +1513,7 @@ class AdaptiveQueryExecSuite test("SPARK-34091: Batch shuffle fetch in AQE partition coalescing") { withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", - SQLConf.SHUFFLE_PARTITIONS.key -> "10000", + SQLConf.SHUFFLE_PARTITIONS.key -> "10", SQLConf.FETCH_SHUFFLE_BLOCKS_IN_BATCH.key -> "true") { withTable("t1") { spark.range(100).selectExpr("id + 1 as a").write.format("parquet").saveAsTable("t1") From e5ee38c71fcdca68838caa1e672ec06f5de5fc26 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 27 May 2021 19:31:56 +0900 Subject: [PATCH 16/30] [SPARK-35535][SQL] New data source V2 API: LocalScan ### What changes were proposed in this pull request? Add a new data source V2 API: `LocalScan`. It is a special Scan that will happen on Driver locally instead of Executors. ### Why are the changes needed? The new API improves the flexibility of the DSV2 API. It allows developers to implement connectors for data sources of small data sizes. For example, we can build a data source for Spark History applications from Spark History Server RESTFUL API. The result set is small and fetching all the results from the Spark driver is good enough. Making it a data source allows us to operate SQL queries with filters or table joins. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test Closes #32678 from gengliangwang/LocalScan. Lead-authored-by: Gengliang Wang Co-authored-by: Gengliang Wang Signed-off-by: Hyukjin Kwon (cherry picked from commit 5bcd1c29f0e8442d73d3985253ccadb6f4078044) --- .../spark/sql/connector/read/LocalScan.java | 31 ++++++ .../datasources/v2/DataSourceV2Strategy.scala | 6 ++ .../spark/sql/connector/LocalScanSuite.scala | 95 +++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java b/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java new file mode 100644 index 0000000000000..4573cf5bc2a28 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connector.read; + +import org.apache.spark.annotation.Experimental; +import org.apache.spark.sql.catalyst.InternalRow; + +/** + * A special Scan which will happen on Driver locally instead of Executors. + * + * @since 3.2.0 + */ +@Experimental +public interface LocalScan extends Scan { + InternalRow[] rows(); +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 811f41832d159..7bbd7b4b07645 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, StagingTableCatalog, SupportsNamespaces, SupportsPartitionManagement, SupportsWrite, Table, TableCapability, TableCatalog, TableChange} +import org.apache.spark.sql.connector.read.LocalScan import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.connector.write.V1Write import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} @@ -104,6 +105,11 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat tableIdentifier = None) withProjectAndFilter(project, filters, dsScan, needsUnsafeConversion = false) :: Nil + case PhysicalOperation(project, filters, + DataSourceV2ScanRelation(_, scan: LocalScan, output)) => + val localScanExec = LocalTableScanExec(output, scan.rows().toSeq) + withProjectAndFilter(project, filters, localScanExec, needsUnsafeConversion = false) :: Nil + case PhysicalOperation(project, filters, relation: DataSourceV2ScanRelation) => // projection and filters were already pushed down in the optimizer. // this uses PhysicalOperation to get the projection and ensure that if the batch scan does diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala new file mode 100644 index 0000000000000..db71eeb75eae0 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connector + +import java.util + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.catalog.{BasicInMemoryTableCatalog, Identifier, SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.read.{LocalScan, Scan, ScanBuilder} +import org.apache.spark.sql.execution.LocalTableScanExec +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class LocalScanSuite extends QueryTest with SharedSparkSession { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat") + spark.conf.set("spark.sql.catalog.testcat", classOf[TestLocalScanCatalog].getName) + sql("CREATE TABLE testcat.tbl(i int)") + } + + override def afterAll(): Unit = { + spark.conf.unset(SQLConf.DEFAULT_CATALOG.key) + spark.conf.unset("spark.sql.catalog.testcat") + super.afterAll() + } + + test("full scan") { + val df = spark.table("testcat.tbl") + assert(df.schema == TestLocalScanTable.schema) + + val localScan = df.queryExecution.executedPlan.collect { + case s: LocalTableScanExec => s + } + assert(localScan.length == 1) + checkAnswer(df, TestLocalScanTable.data.map(Row(_))) + } +} + +class TestLocalScanCatalog extends BasicInMemoryTableCatalog { + override def createTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + val table = new TestLocalScanTable(ident.toString) + tables.put(ident, table) + table + } +} + +object TestLocalScanTable { + val schema = new StructType().add("i", "int") + val data = Seq(1, 2, 3) +} + +class TestLocalScanTable(override val name: String) extends Table with SupportsRead { + override def schema(): StructType = TestLocalScanTable.schema + + override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava + + override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = + new TestLocalScanBuilder + + private class TestLocalScanBuilder extends ScanBuilder { + override def build(): Scan = new TestLocalScan + } + + private class TestLocalScan extends LocalScan { + override def rows(): Array[InternalRow] = TestLocalScanTable.data.map(InternalRow(_)).toArray + + override def readSchema(): StructType = TestLocalScanTable.schema + } +} From cf8e6d63a05b314c45f0d427de3f6889d99e8d3c Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 28 May 2021 13:18:44 +0000 Subject: [PATCH 17/30] [SPARK-35194][SQL] Refactor nested column aliasing for readability ### What changes were proposed in this pull request? Refactors `NestedColumnAliasing` and `GeneratorNestedColumnAliasing` for readability. ### Why are the changes needed? Improves readability for future maintenance. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Existing tests. Closes #32301 from karenfeng/refactor-nested-column-aliasing. Authored-by: Karen Feng Signed-off-by: Wenchen Fan --- .../catalyst/expressions/AttributeMap.scala | 6 + .../catalyst/expressions/AttributeMap.scala | 6 + .../optimizer/NestedColumnAliasing.scala | 426 ++++++++++-------- .../sql/catalyst/optimizer/Optimizer.scala | 4 +- .../optimizer/NestedColumnAliasingSuite.scala | 2 +- 5 files changed, 250 insertions(+), 194 deletions(-) diff --git a/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala index 42b92d4593c77..189318acd8661 100644 --- a/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala +++ b/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala @@ -23,6 +23,10 @@ package org.apache.spark.sql.catalyst.expressions * of the name, or the expected nullability). */ object AttributeMap { + def apply[A](kvs: Map[Attribute, A]): AttributeMap[A] = { + new AttributeMap(kvs.map(kv => (kv._1.exprId, kv))) + } + def apply[A](kvs: Seq[(Attribute, A)]): AttributeMap[A] = { new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap) } @@ -37,6 +41,8 @@ class AttributeMap[A](val baseMap: Map[ExprId, (Attribute, A)]) override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2) + override def getOrElse[B1 >: A](k: Attribute, default: => B1): B1 = get(k).getOrElse(default) + override def contains(k: Attribute): Boolean = get(k).isDefined override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] = baseMap.values.toMap + kv diff --git a/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala index e6b53e3e6548f..77152918bf687 100644 --- a/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala +++ b/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala @@ -23,6 +23,10 @@ package org.apache.spark.sql.catalyst.expressions * of the name, or the expected nullability). 
*/ object AttributeMap { + def apply[A](kvs: Map[Attribute, A]): AttributeMap[A] = { + new AttributeMap(kvs.map(kv => (kv._1.exprId, kv))) + } + def apply[A](kvs: Seq[(Attribute, A)]): AttributeMap[A] = { new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap) } @@ -37,6 +41,8 @@ class AttributeMap[A](val baseMap: Map[ExprId, (Attribute, A)]) override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2) + override def getOrElse[B1 >: A](k: Attribute, default: => B1): B1 = get(k).getOrElse(default) + override def contains(k: Attribute): Boolean = get(k).isDefined override def updated[B1 >: A](key: Attribute, value: B1): Map[Attribute, B1] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 5b12667f4a884..cd7032d555992 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -17,71 +17,151 @@ package org.apache.spark.sql.catalyst.optimizer +import scala.collection.mutable + import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** - * This aims to handle a nested column aliasing pattern inside the `ColumnPruning` optimizer rule. - * If a project or its child references to nested fields, and not all the fields - * in a nested attribute are used, we can substitute them by alias attributes; then a project - * of the nested fields as aliases on the children of the child will be created. + * This aims to handle a nested column aliasing pattern inside the [[ColumnPruning]] optimizer rule. + * If: + * - A [[Project]] or its child references nested fields + * - Not all of the fields in a nested attribute are used + * Then: + * - Substitute the nested field references with alias attributes + * - Add grandchild [[Project]]s transforming the nested fields to aliases + * + * Example 1: Project + * ------------------ + * Before: + * +- Project [concat_ws(s#0.a, s#0.b) AS concat_ws(s.a, s.b)#1] + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- LocalRelation , [s#0] + * After: + * +- Project [concat_ws(_extract_a#2, _extract_b#3) AS concat_ws(s.a, s.b)#1] + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- Project [s#0.a AS _extract_a#2, s#0.b AS _extract_b#3] + * +- LocalRelation , [s#0] + * + * Example 2: Project above Filter + * ------------------------------- + * Before: + * +- Project [s#0.a AS s.a#1] + * +- Filter (length(s#0.b) > 2) + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- LocalRelation , [s#0] + * After: + * +- Project [_extract_a#2 AS s.a#1] + * +- Filter (length(_extract_b#3) > 2) + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- Project [s#0.a AS _extract_a#2, s#0.b AS _extract_b#3] + * +- LocalRelation , [s#0] + * + * Example 3: Nested fields with referenced parents + * ------------------------------------------------ + * Before: + * +- Project [s#0.a AS s.a#1, s#0.a.a1 AS s.a.a1#2] + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- LocalRelation , [s#0] + * After: + * +- Project [_extract_a#3 AS s.a#1, _extract_a#3.a1 AS s.a.a1#2] + * +- GlobalLimit 5 + * +- LocalLimit 5 + * +- Project [s#0.a AS _extract_a#3] + * +- LocalRelation , [s#0] + * + * The schema of the datasource relation will be pruned in the [[SchemaPruning]] optimizer rule.
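 *
 * As a query-level illustration (table and column names assumed), Example 1 above
 * corresponds to a query like:
 * {{{
 *   spark.table("t").limit(5).select(concat_ws("_", $"s.a", $"s.b"))
 * }}}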
*/ object NestedColumnAliasing { def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { /** * This pattern is needed to support [[Filter]] plan cases like - * [[Project]]->[[Filter]]->listed plan in `canProjectPushThrough` (e.g., [[Window]]). - * The reason why we don't simply add [[Filter]] in `canProjectPushThrough` is that + * [[Project]]->[[Filter]]->listed plan in [[canProjectPushThrough]] (e.g., [[Window]]). + * The reason why we don't simply add [[Filter]] in [[canProjectPushThrough]] is that * the optimizer can hit an infinite loop during the [[PushDownPredicates]] rule. */ - case Project(projectList, Filter(condition, child)) - if SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => - val exprCandidatesToPrune = projectList ++ Seq(condition) ++ child.expressions - getAliasSubMap(exprCandidatesToPrune, child.producedAttributes.toSeq).map { - case (nestedFieldToAlias, attrToAliases) => - NestedColumnAliasing.replaceToAliases(plan, nestedFieldToAlias, attrToAliases) - } + case Project(projectList, Filter(condition, child)) if + SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => + rewritePlanIfSubsetFieldsUsed( + plan, projectList ++ Seq(condition) ++ child.expressions, child.producedAttributes.toSeq) - case Project(projectList, child) - if SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => - val exprCandidatesToPrune = projectList ++ child.expressions - getAliasSubMap(exprCandidatesToPrune, child.producedAttributes.toSeq).map { - case (nestedFieldToAlias, attrToAliases) => - NestedColumnAliasing.replaceToAliases(plan, nestedFieldToAlias, attrToAliases) - } + case Project(projectList, child) if + SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => + rewritePlanIfSubsetFieldsUsed( + plan, projectList ++ child.expressions, child.producedAttributes.toSeq) case p if SQLConf.get.nestedSchemaPruningEnabled && canPruneOn(p) => - val exprCandidatesToPrune = p.expressions - getAliasSubMap(exprCandidatesToPrune, p.producedAttributes.toSeq).map { - case (nestedFieldToAlias, attrToAliases) => - NestedColumnAliasing.replaceToAliases(p, nestedFieldToAlias, attrToAliases) - } + rewritePlanIfSubsetFieldsUsed( + plan, p.expressions, p.producedAttributes.toSeq) case _ => None } + /** + * Rewrites a plan with aliases if only a subset of the nested fields are used. + */ + def rewritePlanIfSubsetFieldsUsed( + plan: LogicalPlan, + exprList: Seq[Expression], + exclusiveAttrs: Seq[Attribute]): Option[LogicalPlan] = { + val attrToExtractValues = getAttributeToExtractValues(exprList, exclusiveAttrs) + if (attrToExtractValues.isEmpty) { + None + } else { + Some(rewritePlanWithAliases(plan, attrToExtractValues)) + } + } + /** * Replace nested columns to prune unused nested columns later. */ - private def replaceToAliases( + def rewritePlanWithAliases( plan: LogicalPlan, - nestedFieldToAlias: Map[ExtractValue, Alias], - attrToAliases: Map[ExprId, Seq[Alias]]): LogicalPlan = plan match { - case Project(projectList, child) => - Project( - getNewProjectList(projectList, nestedFieldToAlias), - replaceWithAliases(child, nestedFieldToAlias, attrToAliases)) - - // The operators reaching here was already guarded by `canPruneOn`. - case other => - replaceWithAliases(other, nestedFieldToAlias, attrToAliases) + attributeToExtractValues: Map[Attribute, Seq[ExtractValue]]): LogicalPlan = { + // Each expression can contain multiple nested fields. 
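+ // For illustration (attribute and alias ids assumed): an input map of
+ //   Map(s#0 -> Seq(s#0.a, s#0.b))
+ // becomes
+ //   Map(s#0 -> Seq((s#0.a, s#0.a AS _extract_a#1), (s#0.b, s#0.b AS _extract_b#2))).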
+ // Note that we keep the original names to deliver to parquet in a case-sensitive way. + // A new alias is created for each nested field. + // Implementation detail: we don't use mapValues, because it creates a lazy view. + val attributeToExtractValuesAndAliases = + attributeToExtractValues.map { case (attr, evSeq) => + val evAliasSeq = evSeq.map { ev => + val fieldName = ev match { + case g: GetStructField => g.extractFieldName + case g: GetArrayStructFields => g.field.name + } + ev -> Alias(ev, s"_extract_$fieldName")() + } + + attr -> evAliasSeq + } + + val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap + + // A reference attribute can have multiple aliases for nested fields. + val attrToAliases = AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2))) + + plan match { + case Project(projectList, child) => + Project( + getNewProjectList(projectList, nestedFieldToAlias), + replaceWithAliases(child, nestedFieldToAlias, attrToAliases)) + + // The operators reaching here are already guarded by [[canPruneOn]]. + case other => + replaceWithAliases(other, nestedFieldToAlias, attrToAliases) + } } /** - * Return a replaced project list. + * Replace the [[ExtractValue]]s in a project list with aliased attributes. */ def getNewProjectList( projectList: Seq[NamedExpression], @@ -93,15 +173,15 @@ object NestedColumnAliasing { } /** - * Return a plan with new children replaced with aliases, and expressions replaced with - * aliased attributes. + * Replace the grandchildren of a plan with [[Project]]s of the nested fields as aliases, + * and replace the [[ExtractValue]] expressions with aliased attributes. */ def replaceWithAliases( plan: LogicalPlan, nestedFieldToAlias: Map[ExtractValue, Alias], - attrToAliases: Map[ExprId, Seq[Alias]]): LogicalPlan = { + attrToAliases: AttributeMap[Seq[Alias]]): LogicalPlan = { plan.withNewChildren(plan.children.map { plan => - Project(plan.output.flatMap(a => attrToAliases.getOrElse(a.exprId, Seq(a))), plan) + Project(plan.output.flatMap(a => attrToAliases.getOrElse(a, Seq(a))), plan) }).transformExpressions { case f: ExtractValue if nestedFieldToAlias.contains(f) => nestedFieldToAlias(f).toAttribute @@ -109,7 +189,7 @@ } /** - * Returns true for those operators that we can prune nested column on it. + * Returns true for operators on which we can prune nested columns. */ private def canPruneOn(plan: LogicalPlan) = plan match { case _: Aggregate => true @@ -118,7 +198,7 @@ } /** - * Returns true for those operators that project can be pushed through. + * Returns true for operators through which project can be pushed. */ private def canProjectPushThrough(plan: LogicalPlan) = plan match { case _: GlobalLimit => true @@ -133,9 +213,10 @@ } /** - * Return root references that are individually accessed as a whole, and `GetStructField`s - * or `GetArrayStructField`s which on top of other `ExtractValue`s or special expressions. - * Check `SelectedField` to see which expressions should be listed here. + * Returns two types of expressions: + * - Root references that are individually accessed + * - [[GetStructField]] or [[GetArrayStructFields]] on top of other [[ExtractValue]]s + * or special expressions.
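 *
 * For example (illustrative), `concat_ws(col, s.a)` yields `Seq(col, s.a)`: `col` as a
 * root reference and `s.a` as a [[GetStructField]] over an attribute.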
*/ private def collectRootReferenceAndExtractValue(e: Expression): Seq[Expression] = e match { case _: AttributeReference => Seq(e) @@ -149,67 +230,55 @@ } /** - * Return two maps in order to replace nested fields to aliases. - * - * If `exclusiveAttrs` is given, any nested field accessors of these attributes - * won't be considered in nested fields aliasing. - * - * 1. ExtractValue -> Alias: A new alias is created for each nested field. - * 2. ExprId -> Seq[Alias]: A reference attribute has multiple aliases pointing it. + * Creates a map from root [[Attribute]]s to non-redundant nested [[ExtractValue]]s. + * Nested field accessors of `exclusiveAttrs` are not considered in nested fields aliasing. */ - def getAliasSubMap(exprList: Seq[Expression], exclusiveAttrs: Seq[Attribute] = Seq.empty) - : Option[(Map[ExtractValue, Alias], Map[ExprId, Seq[Alias]])] = { - val (nestedFieldReferences, otherRootReferences) = - exprList.flatMap(collectRootReferenceAndExtractValue).partition { - case _: ExtractValue => true - case _ => false + def getAttributeToExtractValues( + exprList: Seq[Expression], + exclusiveAttrs: Seq[Attribute]): Map[Attribute, Seq[ExtractValue]] = { + + val nestedFieldReferences = new mutable.ArrayBuffer[ExtractValue]() + val otherRootReferences = new mutable.ArrayBuffer[AttributeReference]() + exprList.foreach { e => + collectRootReferenceAndExtractValue(e).foreach { + case ev: ExtractValue => + if (ev.references.size == 1) { + nestedFieldReferences.append(ev) + } + case ar: AttributeReference => otherRootReferences.append(ar) } - - // Note that when we group by extractors with their references, we should remove - // cosmetic variations. + } val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences) - val aliasSub = nestedFieldReferences.asInstanceOf[Seq[ExtractValue]] + + // Remove cosmetic variations when we group extractors by their references + nestedFieldReferences .filter(!_.references.subsetOf(exclusiveAttrSet)) .groupBy(_.references.head.canonicalized.asInstanceOf[Attribute]) - .flatMap { case (attr, nestedFields: Seq[ExtractValue]) => - // Remove redundant `ExtractValue`s if they share the same parent nest field. + .flatMap { case (attr: Attribute, nestedFields: Seq[ExtractValue]) => + // Remove redundant [[ExtractValue]]s if they share the same parent nest field. // For example, when `a.b` and `a.b.c` are in project list, we only need to alias `a.b`. - // We only need to deal with two `ExtractValue`: `GetArrayStructFields` and - // `GetStructField`. Please refer to the method `collectRootReferenceAndExtractValue`. + // Because `a.b` requires all of the inner fields of `b`, we cannot prune `a.b.c`. val dedupNestedFields = nestedFields.filter { + // See [[collectRootReferenceAndExtractValue]]: we only need to deal with [[GetArrayStructFields]] and + // [[GetStructField]] case e @ (_: GetStructField | _: GetArrayStructFields) => val child = e.children.head nestedFields.forall(f => child.find(_.semanticEquals(f)).isEmpty) case _ => true - } - - // Each expression can contain multiple nested fields. - // Note that we keep the original names to deliver to parquet in a case-sensitive way. - val nestedFieldToAlias = dedupNestedFields.distinct.map { f => - val exprId = NamedExpression.newExprId - (f, Alias(f, s"_gen_alias_${exprId.id}")(exprId, Seq.empty, None)) - } + }.distinct // If all nested fields of `attr` are used, we don't need to introduce new aliases. - // By default, ColumnPruning rule uses `attr` already.
+ // By default, the [[ColumnPruning]] rule uses `attr` already. // Note that we need to remove cosmetic variations first, so we only count a // nested field once. - if (nestedFieldToAlias.nonEmpty && - dedupNestedFields.map(_.canonicalized) - .distinct - .map { nestedField => totalFieldNum(nestedField.dataType) } - .sum < totalFieldNum(attr.dataType)) { - Some(attr.exprId -> nestedFieldToAlias) + val numUsedNestedFields = dedupNestedFields.map(_.canonicalized).distinct + .map { nestedField => totalFieldNum(nestedField.dataType) }.sum + if (numUsedNestedFields < totalFieldNum(attr.dataType)) { + Some((attr, dedupNestedFields.toSeq)) } else { None } } - - if (aliasSub.isEmpty) { - None - } else { - Some((aliasSub.values.flatten.toMap, aliasSub.map(x => (x._1, x._2.map(_._2))))) - } } /** @@ -227,31 +296,9 @@ object NestedColumnAliasing { } /** - * This prunes unnecessary nested columns from `Generate` and optional `Project` on top - * of it. + * This prunes unnecessary nested columns from [[Generate]], or [[Project]] -> [[Generate]] */ object GeneratorNestedColumnAliasing { - // Partitions `attrToAliases` based on whether the attribute is in Generator's output. - private def aliasesOnGeneratorOutput( - attrToAliases: Map[ExprId, Seq[Alias]], - generatorOutput: Seq[Attribute]) = { - val generatorOutputExprId = generatorOutput.map(_.exprId) - attrToAliases.partition { k => - generatorOutputExprId.contains(k._1) - } - } - - // Partitions `nestedFieldToAlias` based on whether the attribute of nested field extractor - // is in Generator's output. - private def nestedFieldOnGeneratorOutput( - nestedFieldToAlias: Map[ExtractValue, Alias], - generatorOutput: Seq[Attribute]) = { - val generatorOutputSet = AttributeSet(generatorOutput) - nestedFieldToAlias.partition { pair => - pair._1.references.subsetOf(generatorOutputSet) - } - } - def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { // Either `nestedPruningOnExpressions` or `nestedSchemaPruningEnabled` is enabled, we // need to prune nested columns through Project and under Generate. The difference is @@ -261,103 +308,100 @@ object GeneratorNestedColumnAliasing { SQLConf.get.nestedSchemaPruningEnabled) && canPruneGenerator(g.generator) => // On top on `Generate`, a `Project` that might have nested column accessors. // We try to get alias maps for both project list and generator's children expressions. - val exprsToPrune = projectList ++ g.generator.children - NestedColumnAliasing.getAliasSubMap(exprsToPrune).map { - case (nestedFieldToAlias, attrToAliases) => - val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) = - nestedFieldOnGeneratorOutput(nestedFieldToAlias, g.qualifiedGeneratorOutput) - val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) = - aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput) - - // Push nested column accessors through `Generator`. - // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. - val newChild = NestedColumnAliasing.replaceWithAliases(g, - nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator) - val pushedThrough = Project(NestedColumnAliasing - .getNewProjectList(projectList, nestedFieldsNotOnGenerator), newChild) - - // If the generator output is `ArrayType`, we cannot push through the extractor. - // It is because we don't allow field extractor on two-level array, - // i.e., attr.field when attr is a ArrayType(ArrayType(...)). - // Similarily, we also cannot push through if the child of generator is `MapType`. 
- g.generator.children.head.dataType match { - case _: MapType => return Some(pushedThrough) - case ArrayType(_: ArrayType, _) => return Some(pushedThrough) - case _ => - } - - // Pruning on `Generator`'s output. We only process single field case. - // For multiple field case, we cannot directly move field extractor into - // the generator expression. A workaround is to re-construct array of struct - // from multiple fields. But it will be more complicated and may not worth. - // TODO(SPARK-34956): support multiple fields. - if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.isEmpty) { - pushedThrough - } else { - // Only one nested column accessor. - // E.g., df.select(explode($"items").as("item")).select($"item.a") - pushedThrough match { - case p @ Project(_, newG: Generate) => - // Replace the child expression of `ExplodeBase` generator with - // nested column accessor. - // E.g., df.select(explode($"items").as("item")).select($"item.a") => - // df.select(explode($"items.a").as("item.a")) - val rewrittenG = newG.transformExpressions { - case e: ExplodeBase => - val extractor = nestedFieldsOnGenerator.head._1.transformUp { - case _: Attribute => - e.child - case g: GetStructField => - ExtractValue(g.child, Literal(g.extractFieldName), SQLConf.get.resolver) - } - e.withNewChildren(Seq(extractor)) - } + val attrToExtractValues = NestedColumnAliasing.getAttributeToExtractValues( + projectList ++ g.generator.children, Seq.empty) + if (attrToExtractValues.isEmpty) { + return None + } + val generatorOutputSet = AttributeSet(g.qualifiedGeneratorOutput) + val (attrToExtractValuesOnGenerator, attrToExtractValuesNotOnGenerator) = + attrToExtractValues.partition { case (attr, _) => + attr.references.subsetOf(generatorOutputSet) } + + val pushedThrough = NestedColumnAliasing.rewritePlanWithAliases( + plan, attrToExtractValuesNotOnGenerator) + + // If the generator output is `ArrayType`, we cannot push through the extractor. + // It is because we don't allow field extractor on two-level array, + // i.e., attr.field when attr is an ArrayType(ArrayType(...)). + // Similarly, we also cannot push through if the child of generator is `MapType`. + g.generator.children.head.dataType match { + case _: MapType => return Some(pushedThrough) + case ArrayType(_: ArrayType, _) => return Some(pushedThrough) + case _ => + } - // As we change the child of the generator, its output data type must be updated. - val updatedGeneratorOutput = rewrittenG.generatorOutput - .zip(rewrittenG.generator.elementSchema.toAttributes) - .map { case (oldAttr, newAttr) => - newAttr.withExprId(oldAttr.exprId).withName(oldAttr.name) - } - assert(updatedGeneratorOutput.length == rewrittenG.generatorOutput.length, - "Updated generator output must have the same length " + - "with original generator output.") - val updatedGenerate = rewrittenG.copy(generatorOutput = updatedGeneratorOutput) - - // Replace nested column accessor with generator output. - p.withNewChildren(Seq(updatedGenerate)).transformExpressions { - case f: ExtractValue if nestedFieldsOnGenerator.contains(f) => - updatedGenerate.output - .find(a => attrToAliasesOnGenerator.contains(a.exprId)) - .getOrElse(f) + // Pruning on `Generator`'s output. We only process single field case. + // For multiple field case, we cannot directly move field extractor into + // the generator expression. A workaround is to re-construct array of struct + // from multiple fields. But it will be more complicated and may not be worth it. + // TODO(SPARK-34956): support multiple fields.
+ val nestedFieldsOnGenerator = attrToExtractValuesOnGenerator.values.flatten.toSet + if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.isEmpty) { + Some(pushedThrough) + } else { + // Only one nested column accessor. + // E.g., df.select(explode($"items").as("item")).select($"item.a") + val nestedFieldOnGenerator = nestedFieldsOnGenerator.head + pushedThrough match { + case p @ Project(_, newG: Generate) => + // Replace the child expression of `ExplodeBase` generator with + // nested column accessor. + // E.g., df.select(explode($"items").as("item")).select($"item.a") => + // df.select(explode($"items.a").as("item.a")) + val rewrittenG = newG.transformExpressions { + case e: ExplodeBase => + val extractor = nestedFieldOnGenerator.transformUp { + case _: Attribute => + e.child + case g: GetStructField => + ExtractValue(g.child, Literal(g.extractFieldName), SQLConf.get.resolver) } + e.withNewChildren(Seq(extractor)) + } - case other => - // We should not reach here. - throw new IllegalStateException(s"Unreasonable plan after optimization: $other") + // As we change the child of the generator, its output data type must be updated. + val updatedGeneratorOutput = rewrittenG.generatorOutput + .zip(rewrittenG.generator.elementSchema.toAttributes) + .map { case (oldAttr, newAttr) => + newAttr.withExprId(oldAttr.exprId).withName(oldAttr.name) + } + assert(updatedGeneratorOutput.length == rewrittenG.generatorOutput.length, + "Updated generator output must have the same length " + + "with original generator output.") + val updatedGenerate = rewrittenG.copy(generatorOutput = updatedGeneratorOutput) + + // Replace nested column accessor with generator output. + val attrExprIdsOnGenerator = attrToExtractValuesOnGenerator.keys.map(_.exprId).toSet + val updatedProject = p.withNewChildren(Seq(updatedGenerate)).transformExpressions { + case f: ExtractValue if nestedFieldsOnGenerator.contains(f) => + updatedGenerate.output + .find(a => attrExprIdsOnGenerator.contains(a.exprId)) + .getOrElse(f) } - } + Some(updatedProject) + + case other => + // We should not reach here. + throw new IllegalStateException(s"Unreasonable plan after optimization: $other") + } } case g: Generate if SQLConf.get.nestedSchemaPruningEnabled && - canPruneGenerator(g.generator) => + canPruneGenerator(g.generator) => // If any child output is required by higher projection, we cannot prune on it even we // only use part of nested column of it. A required child output means it is referred // as a whole or partially by higher projection, pruning it here will cause unresolved // query plan. - NestedColumnAliasing.getAliasSubMap( - g.generator.children, g.requiredChildOutput).map { - case (nestedFieldToAlias, attrToAliases) => - // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. - NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, attrToAliases) - } + NestedColumnAliasing.rewritePlanIfSubsetFieldsUsed( + plan, g.generator.children, g.requiredChildOutput) case _ => None } /** - * This is a while-list for pruning nested fields at `Generator`. + * Types of [[Generator]] on which we can prune nested fields. 
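 *
 * For example (illustrative), the `explode` in
 * `df.select(explode($"items").as("item")).select($"item.a")` qualifies, since
 * [[Explode]] is matched below.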
*/ def canPruneGenerator(g: Generator): Boolean = g match { case _: Explode => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 16e3e43356b9c..c90f4bcdd2602 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -785,7 +785,7 @@ object ColumnPruning extends Rule[LogicalPlan] { p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices)) // prune unrequired nested fields from `Generate`. - case GeneratorNestedColumnAliasing(p) => p + case GeneratorNestedColumnAliasing(rewrittenPlan) => rewrittenPlan // Eliminate unneeded attributes from right side of a Left Existence Join. case j @ Join(_, right, LeftExistence(_), _, _) => @@ -819,7 +819,7 @@ object ColumnPruning extends Rule[LogicalPlan] { // Can't prune the columns on LeafNode case p @ Project(_, _: LeafNode) => p - case NestedColumnAliasing(p) => p + case NestedColumnAliasing(rewrittenPlan) => rewrittenPlan // for all other logical plans that inherits the output from it's children // Project over project is handled by the first case, skip it here. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala index a856caa6781e8..643974c9c707d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala @@ -714,7 +714,7 @@ object NestedColumnAliasingSuite { def collectGeneratedAliases(query: LogicalPlan): ArrayBuffer[String] = { val aliases = ArrayBuffer[String]() query.transformAllExpressions { - case a @ Alias(_, name) if name.startsWith("_gen_alias_") => + case a @ Alias(_, name) if name.startsWith("_extract_") => aliases += name a } From 0a2edadedfbcae8b9ed6ff4f04ec9cd39bf50ba7 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 29 May 2021 00:11:16 +0900 Subject: [PATCH 18/30] [SPARK-35194][SQL][FOLLOWUP] Recover build error with Scala 2.13 on GA ### What changes were proposed in this pull request? This PR fixes a build error with Scala 2.13 on GA. #32301 seems to bring this error. ### Why are the changes needed? To recover CI. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? GA Closes #32696 from sarutak/followup-SPARK-35194. Authored-by: Kousuke Saruta Signed-off-by: Kousuke Saruta --- .../spark/sql/catalyst/optimizer/NestedColumnAliasing.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index cd7032d555992..e0e8f926019f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -146,7 +146,8 @@ object NestedColumnAliasing { val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap // A reference attribute can have multiple aliases for nested fields. 
- val attrToAliases = AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2))) + val attrToAliases = + AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2)).toSeq) plan match { case Project(projectList, child) => From b9e4faf2ed6b1d7e79af07ae3409e9a50f6844c2 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 27 May 2021 12:33:30 +0900 Subject: [PATCH 19/30] [SPARK-35098][PYTHON] Re-enable pandas-on-Spark test cases Re-enable some pandas-on-Spark test cases. pandas version in GitHub Actions is upgraded now so we can re-enable some pandas-on-Spark test cases. No. Unit tests. Closes #32682 from xinrong-databricks/enable_tests. Authored-by: Xinrong Meng Signed-off-by: Hyukjin Kwon --- .../pyspark/pandas/tests/indexes/test_base.py | 1546 ++++++------ python/pyspark/pandas/tests/test_series.py | 2110 +++++++++-------- 2 files changed, 1840 insertions(+), 1816 deletions(-) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index a0eb243a6c56a..b6d4182825ee6 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -43,7 +43,7 @@ def pdf(self): ) @property - def kdf(self): + def psdf(self): return ps.from_pandas(self.pdf) def test_index_basic(self): @@ -61,55 +61,55 @@ def test_index_basic(self): pd.DataFrame(np.random.randn(10, 5), index=pd.Categorical(list("abcdefghij"))), pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")).set_index(["a", "b"]), ]: - kdf = ps.from_pandas(pdf) - self.assert_eq(kdf.index, pdf.index) - self.assert_eq(type(kdf.index).__name__, type(pdf.index).__name__) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.index, pdf.index) + self.assert_eq(type(psdf.index).__name__, type(pdf.index).__name__) def test_index_from_series(self): pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(ps.Index(kser), pd.Index(pser)) - self.assert_eq(ps.Index(kser, dtype="float"), pd.Index(pser, dtype="float")) - self.assert_eq(ps.Index(kser, name="x"), pd.Index(pser, name="x")) + self.assert_eq(ps.Index(psser), pd.Index(pser)) + self.assert_eq(ps.Index(psser, dtype="float"), pd.Index(pser, dtype="float")) + self.assert_eq(ps.Index(psser, name="x"), pd.Index(pser, name="x")) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(ps.Int64Index(kser), pd.Int64Index(pser)) - self.assert_eq(ps.Float64Index(kser), pd.Float64Index(pser)) + self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser)) + self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser)) else: - self.assert_eq(ps.Int64Index(kser), pd.Int64Index(pser).rename("a")) - self.assert_eq(ps.Float64Index(kser), pd.Float64Index(pser).rename("a")) + self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser).rename("a")) + self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser).rename("a")) pser = pd.Series([datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(ps.Index(kser), pd.Index(pser)) - self.assert_eq(ps.DatetimeIndex(kser), pd.DatetimeIndex(pser)) + self.assert_eq(ps.Index(psser), pd.Index(pser)) + self.assert_eq(ps.DatetimeIndex(psser), pd.DatetimeIndex(pser)) def test_index_from_index(self): pidx = pd.Index([1, 2, 3], name="a") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(ps.Index(kidx), pd.Index(pidx)) - self.assert_eq(ps.Index(kidx, 
dtype="float"), pd.Index(pidx, dtype="float")) - self.assert_eq(ps.Index(kidx, name="x"), pd.Index(pidx, name="x")) + self.assert_eq(ps.Index(psidx), pd.Index(pidx)) + self.assert_eq(ps.Index(psidx, dtype="float"), pd.Index(pidx, dtype="float")) + self.assert_eq(ps.Index(psidx, name="x"), pd.Index(pidx, name="x")) - self.assert_eq(ps.Int64Index(kidx), pd.Int64Index(pidx)) - self.assert_eq(ps.Float64Index(kidx), pd.Float64Index(pidx)) + self.assert_eq(ps.Int64Index(psidx), pd.Int64Index(pidx)) + self.assert_eq(ps.Float64Index(psidx), pd.Float64Index(pidx)) pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(ps.Index(kidx), pd.Index(pidx)) - self.assert_eq(ps.DatetimeIndex(kidx), pd.DatetimeIndex(pidx)) + self.assert_eq(ps.Index(psidx), pd.Index(pidx)) + self.assert_eq(ps.DatetimeIndex(psidx), pd.DatetimeIndex(pidx)) def test_index_getattr(self): - kidx = self.kdf.index + psidx = self.psdf.index item = "databricks" expected_error_message = "'.*Index' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): - kidx.__getattr__(item) + psidx.__getattr__(item) with self.assertRaisesRegex(AttributeError, expected_error_message): ps.from_pandas(pd.date_range("2011-01-01", freq="D", periods=10)).__getattr__(item) @@ -117,148 +117,148 @@ def test_multi_index_getattr(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - kdf = ps.from_pandas(pdf) - kidx = kdf.index + psdf = ps.from_pandas(pdf) + psidx = psdf.index item = "databricks" expected_error_message = "'MultiIndex' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): - kidx.__getattr__(item) + psidx.__getattr__(item) def test_to_series(self): pidx = self.pdf.index - kidx = self.kdf.index + psidx = self.psdf.index - self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) # With name pidx.name = "Koalas" - kidx.name = "Koalas" - self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) + psidx.name = "Koalas" + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) # With tupled name pidx.name = ("x", "a") - kidx.name = ("x", "a") - self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) + psidx.name = ("x", "a") + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) - self.assert_eq((kidx + 1).to_series(), (pidx + 1).to_series()) + self.assert_eq((psidx + 1).to_series(), (pidx + 1).to_series()) pidx = self.pdf.set_index("b", append=True).index - kidx = self.kdf.set_index("b", append=True).index + psidx = self.psdf.set_index("b", append=True).index with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): - self.assert_eq(kidx.to_series(), pidx.to_series()) - self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) 
expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - kidx.to_series(name=["x", "a"]) + psidx.to_series(name=["x", "a"]) def test_to_frame(self): pidx = self.pdf.index - kidx = self.kdf.index + psidx = self.psdf.index - self.assert_eq(kidx.to_frame(), pidx.to_frame()) - self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) pidx.name = "a" - kidx.name = "a" + psidx.name = "a" - self.assert_eq(kidx.to_frame(), pidx.to_frame()) - self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. - self.assert_eq(kidx.to_frame(name="x"), pidx.to_frame(name="x")) + self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) self.assert_eq( - kidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"), + psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"), ) - self.assertRaises(TypeError, lambda: kidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"])) # non-string name - self.assert_eq(kidx.to_frame(name=10), pidx.to_frame(name=10)) - self.assert_eq(kidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) pidx = self.pdf.set_index("b", append=True).index - kidx = self.kdf.set_index("b", append=True).index + psidx = self.psdf.set_index("b", append=True).index - self.assert_eq(kidx.to_frame(), pidx.to_frame()) - self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. 
- self.assert_eq(kidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) - self.assert_eq(kidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) + self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) + self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) self.assert_eq( - kidx.to_frame(index=False, name=["x", "y"]), + psidx.to_frame(index=False, name=["x", "y"]), pidx.to_frame(index=False, name=["x", "y"]), ) - self.assertRaises(TypeError, lambda: kidx.to_frame(name="x")) - self.assertRaises(ValueError, lambda: kidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) + self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) # non-string names - self.assert_eq(kidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) - self.assert_eq(kidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) self.assert_eq( - kidx.to_frame(name=[("x", 10), ("y", 20)]), + psidx.to_frame(name=[("x", 10), ("y", 20)]), pidx.to_frame(name=[("x", 10), ("y", 20)]), ) def test_index_names(self): - kdf = self.kdf - self.assertIsNone(kdf.index.name) + psdf = self.psdf + self.assertIsNone(psdf.index.name) idx = pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") pdf = pd.DataFrame(np.random.randn(10, 5), index=idx, columns=list("abcde")) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.a - kser = kdf.a + psser = psdf.a - self.assertEqual(kdf.index.name, pdf.index.name) - self.assertEqual(kdf.index.names, pdf.index.names) + self.assertEqual(psdf.index.name, pdf.index.name) + self.assertEqual(psdf.index.names, pdf.index.names) pidx = pdf.index - kidx = kdf.index + psidx = psdf.index pidx.name = "renamed" - kidx.name = "renamed" - self.assertEqual(kidx.name, pidx.name) - self.assertEqual(kidx.names, pidx.names) - self.assert_eq(kidx, pidx) - self.assertEqual(kdf.index.name, pdf.index.name) - self.assertEqual(kdf.index.names, pdf.index.names) - self.assertEqual(kser.index.names, pser.index.names) + psidx.name = "renamed" + self.assertEqual(psidx.name, pidx.name) + self.assertEqual(psidx.names, pidx.names) + self.assert_eq(psidx, pidx) + self.assertEqual(psdf.index.name, pdf.index.name) + self.assertEqual(psdf.index.names, pdf.index.names) + self.assertEqual(psser.index.names, pser.index.names) pidx.name = None - kidx.name = None - self.assertEqual(kidx.name, pidx.name) - self.assertEqual(kidx.names, pidx.names) - self.assert_eq(kidx, pidx) - self.assertEqual(kdf.index.name, pdf.index.name) - self.assertEqual(kdf.index.names, pdf.index.names) - self.assertEqual(kser.index.names, pser.index.names) + psidx.name = None + self.assertEqual(psidx.name, pidx.name) + self.assertEqual(psidx.names, pidx.names) + self.assert_eq(psidx, pidx) + self.assertEqual(psdf.index.name, pdf.index.name) + self.assertEqual(psdf.index.names, pdf.index.names) + self.assertEqual(psser.index.names, pser.index.names) with self.assertRaisesRegex(ValueError, "Names must be a list-like"): - kidx.names = "hi" + psidx.names = "hi" expected_error_message = "Length of new names must be {}, got {}".format( - kdf._internal.index_level, len(["0", "1"]) + psdf._internal.index_level, len(["0", "1"]) ) with self.assertRaisesRegex(ValueError, expected_error_message): - kidx.names = ["0", "1"] + psidx.names = ["0", "1"] expected_error_message = "Index.name must be a hashable 
type" with self.assertRaisesRegex(TypeError, expected_error_message): ps.Index([1, 2, 3], name=["0", "1"]) with self.assertRaisesRegex(TypeError, expected_error_message): - kidx.name = ["renamed"] + psidx.name = ["renamed"] with self.assertRaisesRegex(TypeError, expected_error_message): - kidx.name = ["0", "1"] + psidx.name = ["0", "1"] with self.assertRaisesRegex(TypeError, expected_error_message): ps.Index([(1, 2), (3, 4)], names=["a", ["b"]]) @@ -266,143 +266,143 @@ def test_multi_index_names(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.names, pdf.index.names) + self.assertEqual(psdf.index.names, pdf.index.names) pidx = pdf.index - kidx = kdf.index + psidx = psdf.index pidx.names = ["renamed_number", "renamed_color"] - kidx.names = ["renamed_number", "renamed_color"] - self.assertEqual(kidx.names, pidx.names) + psidx.names = ["renamed_number", "renamed_color"] + self.assertEqual(psidx.names, pidx.names) pidx.names = ["renamed_number", None] - kidx.names = ["renamed_number", None] - self.assertEqual(kidx.names, pidx.names) - self.assert_eq(kidx, pidx) + psidx.names = ["renamed_number", None] + self.assertEqual(psidx.names, pidx.names) + self.assert_eq(psidx, pidx) with self.assertRaises(PandasNotImplementedError): - kidx.name + psidx.name with self.assertRaises(PandasNotImplementedError): - kidx.name = "renamed" + psidx.name = "renamed" def test_index_rename(self): pdf = pd.DataFrame( np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") ) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pidx = pdf.index - kidx = kdf.index + psidx = psdf.index - self.assert_eq(kidx.rename("y"), pidx.rename("y")) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psidx.rename("y"), pidx.rename("y")) + self.assert_eq(psdf.index.names, pdf.index.names) # non-string names - self.assert_eq(kidx.rename(0), pidx.rename(0)) - self.assert_eq(kidx.rename(("y", 0)), pidx.rename(("y", 0))) + self.assert_eq(psidx.rename(0), pidx.rename(0)) + self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0))) - kidx.rename("z", inplace=True) + psidx.rename("z", inplace=True) pidx.rename("z", inplace=True) - self.assert_eq(kidx, pidx) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psidx, pidx) + self.assert_eq(psdf.index.names, pdf.index.names) - self.assert_eq(kidx.rename(None), pidx.rename(None)) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psidx.rename(None), pidx.rename(None)) + self.assert_eq(psdf.index.names, pdf.index.names) - self.assertRaises(TypeError, lambda: kidx.rename(["x", "y"])) + self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"])) def test_multi_index_rename(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pmidx = pdf.index - kmidx = kdf.index + psmidx = psdf.index - self.assert_eq(kmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) + self.assert_eq(psdf.index.names, pdf.index.names) # non-string names - self.assert_eq(kmidx.rename([0, 1]), pmidx.rename([0, 1])) + 
self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1])) self.assert_eq( - kmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) + psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) ) - kmidx.rename(["num", "col"], inplace=True) + psmidx.rename(["num", "col"], inplace=True) pmidx.rename(["num", "col"], inplace=True) - self.assert_eq(kmidx, pmidx) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psmidx, pmidx) + self.assert_eq(psdf.index.names, pdf.index.names) - self.assert_eq(kmidx.rename([None, None]), pmidx.rename([None, None])) - self.assert_eq(kdf.index.names, pdf.index.names) + self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None])) + self.assert_eq(psdf.index.names, pdf.index.names) - self.assertRaises(TypeError, lambda: kmidx.rename("number")) - self.assertRaises(TypeError, lambda: kmidx.rename(None)) - self.assertRaises(ValueError, lambda: kmidx.rename(["number"])) + self.assertRaises(TypeError, lambda: psmidx.rename("number")) + self.assertRaises(TypeError, lambda: psmidx.rename(None)) + self.assertRaises(ValueError, lambda: psmidx.rename(["number"])) def test_multi_index_levshape(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - kidx = ps.from_pandas(pidx) - self.assertEqual(pidx.levshape, kidx.levshape) + psidx = ps.from_pandas(pidx) + self.assertEqual(pidx.levshape, psidx.levshape) def test_index_unique(self): - kidx = self.kdf.index + psidx = self.psdf.index # here the output is different than pandas in terms of order expected = [0, 1, 3, 5, 6, 8, 9] - self.assert_eq(expected, sorted(kidx.unique().to_pandas())) - self.assert_eq(expected, sorted(kidx.unique(level=0).to_pandas())) + self.assert_eq(expected, sorted(psidx.unique().to_pandas())) + self.assert_eq(expected, sorted(psidx.unique(level=0).to_pandas())) expected = [1, 2, 4, 6, 7, 9, 10] - self.assert_eq(expected, sorted((kidx + 1).unique().to_pandas())) + self.assert_eq(expected, sorted((psidx + 1).unique().to_pandas())) with self.assertRaisesRegex(IndexError, "Too many levels*"): - kidx.unique(level=1) + psidx.unique(level=1) with self.assertRaisesRegex(KeyError, "Requested level (hi)*"): - kidx.unique(level="hi") + psidx.unique(level="hi") def test_multi_index_copy(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq(kdf.index.copy(), pdf.index.copy()) + self.assert_eq(psdf.index.copy(), pdf.index.copy()) def test_drop_duplicates(self): pidx = pd.Index([4, 2, 4, 1, 4, 3]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.drop_duplicates().sort_values(), pidx.drop_duplicates().sort_values()) + self.assert_eq(psidx.drop_duplicates().sort_values(), pidx.drop_duplicates().sort_values()) self.assert_eq( - (kidx + 1).drop_duplicates().sort_values(), (pidx + 1).drop_duplicates().sort_values() + (psidx + 1).drop_duplicates().sort_values(), (pidx + 1).drop_duplicates().sort_values() ) def test_dropna(self): pidx = pd.Index([np.nan, 2, 4, 1, np.nan, 3]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.dropna(), pidx.dropna()) - self.assert_eq((kidx + 1).dropna(), (pidx + 1).dropna()) + self.assert_eq(psidx.dropna(), pidx.dropna()) + self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna()) def test_index_symmetric_difference(self): pidx1 = pd.Index([1, 
2, 3, 4]) pidx2 = pd.Index([2, 3, 4, 5]) - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) self.assert_eq( - kidx1.symmetric_difference(kidx2).sort_values(), + psidx1.symmetric_difference(psidx2).sort_values(), pidx1.symmetric_difference(pidx2).sort_values(), ) self.assert_eq( - (kidx1 + 1).symmetric_difference(kidx2).sort_values(), + (psidx1 + 1).symmetric_difference(psidx2).sort_values(), (pidx1 + 1).symmetric_difference(pidx2).sort_values(), ) @@ -414,11 +414,11 @@ def test_index_symmetric_difference(self): [["koalas", "cow", "falcon"], ["speed", "weight", "length"]], [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]], ) - kmidx1 = ps.from_pandas(pmidx1) - kmidx2 = ps.from_pandas(pmidx2) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) self.assert_eq( - kmidx1.symmetric_difference(kmidx2).sort_values(), + psmidx1.symmetric_difference(psmidx2).sort_values(), pmidx1.symmetric_difference(pmidx2).sort_values(), ) @@ -442,7 +442,7 @@ def test_multi_index_symmetric_difference(self): midx.symmetric_difference(idx) def test_missing(self): - kdf = ps.DataFrame( + psdf = ps.DataFrame( { "a": [1, 2, 3], "b": [4, 5, 6], @@ -461,7 +461,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index("a").index, name)() + getattr(psdf.set_index("a").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -470,7 +470,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index("a").index, name)() + getattr(psdf.set_index("a").index, name)() # MultiIndex functions missing_functions = inspect.getmembers(MissingPandasLikeMultiIndex, inspect.isfunction) @@ -482,7 +482,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index(["a", "b"]).index, name)() + getattr(psdf.set_index(["a", "b"]).index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -491,7 +491,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index(["a", "b"]).index, name)() + getattr(psdf.set_index(["a", "b"]).index, name)() # DatetimeIndex functions missing_functions = inspect.getmembers(MissingPandasLikeDatetimeIndex, inspect.isfunction) @@ -503,7 +503,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index("c").index, name)() + getattr(psdf.set_index("c").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -512,7 +512,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index("c").index, name)() + getattr(psdf.set_index("c").index, name)() # CategoricalIndex functions missing_functions = inspect.getmembers( @@ -526,7 +526,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), ): - getattr(kdf.set_index("d").index, name)() + getattr(psdf.set_index("d").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -535,7 +535,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index("d").index, name)() + getattr(psdf.set_index("d").index, name)() # Index properties missing_properties = inspect.getmembers( @@ -551,7 +551,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index("a").index, name) + getattr(psdf.set_index("a").index, name) deprecated_properties = [ name @@ -562,7 +562,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index("a").index, name) + getattr(psdf.set_index("a").index, name) # MultiIndex properties missing_properties = inspect.getmembers( @@ -578,7 +578,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index(["a", "b"]).index, name) + getattr(psdf.set_index(["a", "b"]).index, name) deprecated_properties = [ name @@ -589,7 +589,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) ): - getattr(kdf.set_index(["a", "b"]).index, name) + getattr(psdf.set_index(["a", "b"]).index, name) # DatetimeIndex properties missing_properties = inspect.getmembers( @@ -605,7 +605,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(kdf.set_index("c").index, name) + getattr(psdf.set_index("c").index, name) # CategoricalIndex properties missing_properties = inspect.getmembers( @@ -621,7 +621,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), ): - getattr(kdf.set_index("d").index, name) + getattr(psdf.set_index("d").index, name) def test_index_has_duplicates(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] @@ -630,9 +630,9 @@ def test_index_has_duplicates(self): for idx, name, expected in zip(indexes, names, has_dup): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name)) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.has_duplicates, expected) + self.assertEqual(psdf.index.has_duplicates, expected) def test_multiindex_has_duplicates(self): indexes = [ @@ -645,170 +645,172 @@ def test_multiindex_has_duplicates(self): for idx, expected in zip(indexes, has_dup): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.has_duplicates, expected) + self.assertEqual(psdf.index.has_duplicates, expected) def test_multi_index_not_supported(self): - kdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"): - kdf.set_index(["a", "b"]).index.any() + psdf.set_index(["a", "b"]).index.any() with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"): - kdf.set_index(["a", "b"]).index.all() + psdf.set_index(["a", "b"]).index.all() def test_index_nlevels(self): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.nlevels, 1) + self.assertEqual(psdf.index.nlevels, 1) def test_multiindex_nlevel(self): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")]) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.nlevels, 2) + self.assertEqual(psdf.index.nlevels, 2) def test_multiindex_from_arrays(self): arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]] pidx = pd.MultiIndex.from_arrays(arrays) - kidx = ps.MultiIndex.from_arrays(arrays) + psidx = ps.MultiIndex.from_arrays(arrays) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) def test_multiindex_swaplevel(self): pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(-2, -1), kidx.swaplevel(-2, -1)) - self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) - self.assert_eq(pidx.swaplevel("word", 1), kidx.swaplevel("word", 1)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(-2, -1), psidx.swaplevel(-2, -1)) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) + self.assert_eq(pidx.swaplevel("word", 1), psidx.swaplevel("word", 1)) with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - kidx.swaplevel(-3, "word") + psidx.swaplevel(-3, "word") with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - kidx.swaplevel(0, 2) + psidx.swaplevel(0, 2) with 
self.assertRaisesRegex(IndexError, "Too many levels: Index"): - kidx.swaplevel(0, -3) + psidx.swaplevel(0, -3) with self.assertRaisesRegex(KeyError, "Level work not found"): - kidx.swaplevel(0, "work") + psidx.swaplevel(0, "work") def test_multiindex_droplevel(self): pidx = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"] ) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 5"): - kidx.droplevel(4) + psidx.droplevel(4) with self.assertRaisesRegex(KeyError, "Level level4 not found"): - kidx.droplevel("level4") + psidx.droplevel("level4") with self.assertRaisesRegex(KeyError, "Level.*level3.*level4.*not found"): - kidx.droplevel([("level3", "level4")]) + psidx.droplevel([("level3", "level4")]) with self.assertRaisesRegex( ValueError, "Cannot remove 4 levels from an index with 3 levels: at least one " "level must be left.", ): - kidx.droplevel([0, 0, 1, 2]) + psidx.droplevel([0, 0, 1, 2]) with self.assertRaisesRegex( ValueError, "Cannot remove 3 levels from an index with 3 levels: at least one " "level must be left.", ): - kidx.droplevel([0, 1, 2]) + psidx.droplevel([0, 1, 2]) - self.assert_eq(pidx.droplevel(0), kidx.droplevel(0)) - self.assert_eq(pidx.droplevel([0, 1]), kidx.droplevel([0, 1])) - self.assert_eq(pidx.droplevel((0, 1)), kidx.droplevel((0, 1))) - self.assert_eq(pidx.droplevel([0, "level2"]), kidx.droplevel([0, "level2"])) - self.assert_eq(pidx.droplevel((0, "level2")), kidx.droplevel((0, "level2"))) + self.assert_eq(pidx.droplevel(0), psidx.droplevel(0)) + self.assert_eq(pidx.droplevel([0, 1]), psidx.droplevel([0, 1])) + self.assert_eq(pidx.droplevel((0, 1)), psidx.droplevel((0, 1))) + self.assert_eq(pidx.droplevel([0, "level2"]), psidx.droplevel([0, "level2"])) + self.assert_eq(pidx.droplevel((0, "level2")), psidx.droplevel((0, "level2"))) # non-string names pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.droplevel(1.0), kidx.droplevel(1.0)) - self.assert_eq(pidx.droplevel([0, 2.0]), kidx.droplevel([0, 2.0])) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.droplevel(1.0), psidx.droplevel(1.0)) + self.assert_eq(pidx.droplevel([0, 2.0]), psidx.droplevel([0, 2.0])) def test_index_fillna(self): pidx = pd.Index([1, 2, None]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.fillna(0), kidx.fillna(0), almost=True) - self.assert_eq(pidx.rename("name").fillna(0), kidx.rename("name").fillna(0), almost=True) + self.assert_eq(pidx.fillna(0), psidx.fillna(0), almost=True) + self.assert_eq(pidx.rename("name").fillna(0), psidx.rename("name").fillna(0), almost=True) with self.assertRaisesRegex(TypeError, "Unsupported type list"): - kidx.fillna([1, 2]) + psidx.fillna([1, 2]) def test_index_drop(self): pidx = pd.Index([1, 2, 3]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop(1), kidx.drop(1)) - self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2])) - self.assert_eq((pidx + 1).drop([2, 3]), (kidx + 1).drop([2, 3])) + self.assert_eq(pidx.drop(1), psidx.drop(1)) + self.assert_eq(pidx.drop([1, 2]), psidx.drop([1, 2])) + self.assert_eq((pidx + 1).drop([2, 3]), (psidx + 1).drop([2, 3])) def test_multiindex_drop(self): pidx = pd.MultiIndex.from_tuples( [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"] ) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop("a"), 
kidx.drop("a")) - self.assert_eq(pidx.drop(["a", "b"]), kidx.drop(["a", "b"])) - self.assert_eq(pidx.drop(["x", "y"], level=1), kidx.drop(["x", "y"], level=1)) - self.assert_eq(pidx.drop(["x", "y"], level="level2"), kidx.drop(["x", "y"], level="level2")) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop("a"), psidx.drop("a")) + self.assert_eq(pidx.drop(["a", "b"]), psidx.drop(["a", "b"])) + self.assert_eq(pidx.drop(["x", "y"], level=1), psidx.drop(["x", "y"], level=1)) + self.assert_eq( + pidx.drop(["x", "y"], level="level2"), psidx.drop(["x", "y"], level="level2") + ) pidx.names = ["lv1", "lv2"] - kidx.names = ["lv1", "lv2"] - self.assert_eq(pidx.drop(["x", "y"], level="lv2"), kidx.drop(["x", "y"], level="lv2")) + psidx.names = ["lv1", "lv2"] + self.assert_eq(pidx.drop(["x", "y"], level="lv2"), psidx.drop(["x", "y"], level="lv2")) - self.assertRaises(IndexError, lambda: kidx.drop(["a", "b"], level=2)) - self.assertRaises(KeyError, lambda: kidx.drop(["a", "b"], level="level")) + self.assertRaises(IndexError, lambda: psidx.drop(["a", "b"], level=2)) + self.assertRaises(KeyError, lambda: psidx.drop(["a", "b"], level="level")) - kidx.names = ["lv", "lv"] - self.assertRaises(ValueError, lambda: kidx.drop(["x", "y"], level="lv")) + psidx.names = ["lv", "lv"] + self.assertRaises(ValueError, lambda: psidx.drop(["x", "y"], level="lv")) def test_sort_values(self): pidx = pd.Index([-10, -100, 200, 100]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.sort_values(), kidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), psidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) pidx.name = "koalas" - kidx.name = "koalas" + psidx.name = "koalas" - self.assert_eq(pidx.sort_values(), kidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), psidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) pidx.names = ["hello", "koalas", "goodbye"] - kidx.names = ["hello", "koalas", "goodbye"] + psidx.names = ["hello", "koalas", "goodbye"] - self.assert_eq(pidx.sort_values(), kidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), psidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) def test_index_drop_duplicates(self): pidx = pd.Index([1, 1, 2]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) def test_index_sort(self): idx = ps.Index([1, 2, 3, 4, 5]) @@ -824,204 +826,208 @@ def test_index_sort(self): midx.sort() def test_multiindex_isna(self): - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), 
("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - kidx.isna() + psidx.isna() with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - kidx.isnull() + psidx.isnull() with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - kidx.notna() + psidx.notna() with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - kidx.notnull() + psidx.notnull() def test_index_nunique(self): pidx = pd.Index([1, 1, 2, None]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.nunique(), kidx.nunique()) - self.assert_eq(pidx.nunique(dropna=True), kidx.nunique(dropna=True)) + self.assert_eq(pidx.nunique(), psidx.nunique()) + self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True)) def test_multiindex_nunique(self): - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - kidx.notnull() + psidx.notnull() def test_multiindex_rename(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) pidx = pidx.rename(list("ABC")) - kidx = kidx.rename(list("ABC")) - self.assert_eq(pidx, kidx) + psidx = psidx.rename(list("ABC")) + self.assert_eq(pidx, psidx) pidx = pidx.rename(["my", "name", "is"]) - kidx = kidx.rename(["my", "name", "is"]) - self.assert_eq(pidx, kidx) + psidx = psidx.rename(["my", "name", "is"]) + self.assert_eq(pidx, psidx) def test_multiindex_set_names(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) pidx = pidx.set_names(["set", "new", "names"]) - kidx = kidx.set_names(["set", "new", "names"]) - self.assert_eq(pidx, kidx) + psidx = psidx.set_names(["set", "new", "names"]) + self.assert_eq(pidx, psidx) pidx.set_names(["set", "new", "names"], inplace=True) - kidx.set_names(["set", "new", "names"], inplace=True) - self.assert_eq(pidx, kidx) + psidx.set_names(["set", "new", "names"], inplace=True) + self.assert_eq(pidx, psidx) pidx = pidx.set_names("first", level=0) - kidx = kidx.set_names("first", level=0) - self.assert_eq(pidx, kidx) + psidx = psidx.set_names("first", level=0) + self.assert_eq(pidx, psidx) pidx = pidx.set_names("second", level=1) - kidx = kidx.set_names("second", level=1) - self.assert_eq(pidx, kidx) + psidx = psidx.set_names("second", level=1) + self.assert_eq(pidx, psidx) pidx = pidx.set_names("third", level=2) - kidx = kidx.set_names("third", level=2) - self.assert_eq(pidx, kidx) + psidx = psidx.set_names("third", level=2) + self.assert_eq(pidx, psidx) pidx.set_names("first", level=0, inplace=True) - kidx.set_names("first", level=0, inplace=True) - self.assert_eq(pidx, kidx) + psidx.set_names("first", level=0, inplace=True) + self.assert_eq(pidx, psidx) pidx.set_names("second", level=1, inplace=True) - kidx.set_names("second", level=1, inplace=True) - self.assert_eq(pidx, kidx) + psidx.set_names("second", level=1, inplace=True) + self.assert_eq(pidx, psidx) pidx.set_names("third", level=2, inplace=True) - kidx.set_names("third", level=2, inplace=True) - self.assert_eq(pidx, kidx) + psidx.set_names("third", level=2, 
inplace=True) + self.assert_eq(pidx, psidx) def test_multiindex_from_tuples(self): tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] pidx = pd.MultiIndex.from_tuples(tuples) - kidx = ps.MultiIndex.from_tuples(tuples) + psidx = ps.MultiIndex.from_tuples(tuples) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) def test_multiindex_from_product(self): iterables = [[0, 1, 2], ["green", "purple"]] pidx = pd.MultiIndex.from_product(iterables) - kidx = ps.MultiIndex.from_product(iterables) + psidx = ps.MultiIndex.from_product(iterables) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) def test_multiindex_tuple_column_name(self): column_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=column_labels) pdf.set_index(("a", "x"), append=True, inplace=True) - kdf = ps.from_pandas(pdf) - self.assert_eq(pdf, kdf) + psdf = ps.from_pandas(pdf) + self.assert_eq(pdf, psdf) def test_len(self): pidx = pd.Index(range(10000)) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(len(pidx), len(kidx)) + self.assert_eq(len(pidx), len(psidx)) pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - self.assert_eq(len(pidx), len(kidx)) + self.assert_eq(len(pidx), len(psidx)) def test_delete(self): pidx = pd.Index([10, 9, 8, 7, 6, 7, 8, 9, 10]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.delete(8).sort_values(), kidx.delete(8).sort_values()) - self.assert_eq(pidx.delete(-9).sort_values(), kidx.delete(-9).sort_values()) - self.assert_eq(pidx.delete([-9, 0, 8]).sort_values(), kidx.delete([-9, 0, 8]).sort_values()) + self.assert_eq(pidx.delete(8).sort_values(), psidx.delete(8).sort_values()) + self.assert_eq(pidx.delete(-9).sort_values(), psidx.delete(-9).sort_values()) + self.assert_eq( + pidx.delete([-9, 0, 8]).sort_values(), psidx.delete([-9, 0, 8]).sort_values() + ) with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"): - kidx.delete([0, 9]) + psidx.delete([0, 9]) with self.assertRaisesRegex( IndexError, "index -10 is out of bounds for axis 0 with size 9" ): - kidx.delete([-10, 0]) + psidx.delete([-10, 0]) with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"): - kidx.delete(9) + psidx.delete(9) with self.assertRaisesRegex( IndexError, "index -10 is out of bounds for axis 0 with size 9" ): - kidx.delete(-10) + psidx.delete(-10) # MultiIndex pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - self.assert_eq(pidx.delete(2).sort_values(), kidx.delete(2).sort_values()) - self.assert_eq(pidx.delete(-3).sort_values(), kidx.delete(-3).sort_values()) - self.assert_eq(pidx.delete([-3, 0, 2]).sort_values(), kidx.delete([-3, 0, 2]).sort_values()) + self.assert_eq(pidx.delete(2).sort_values(), psidx.delete(2).sort_values()) + self.assert_eq(pidx.delete(-3).sort_values(), psidx.delete(-3).sort_values()) + self.assert_eq( + pidx.delete([-3, 0, 2]).sort_values(), psidx.delete([-3, 0, 2]).sort_values() + ) with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"): - 
kidx.delete([0, 3]) + psidx.delete([0, 3]) with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"): - kidx.delete([-4, 0]) + psidx.delete([-4, 0]) with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"): - kidx.delete(3) + psidx.delete(3) with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"): - kidx.delete(-4) + psidx.delete(-4) def test_append(self): # Index pidx = pd.Index(range(10000)) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.append(pidx), kidx.append(kidx)) + self.assert_eq(pidx.append(pidx), psidx.append(psidx)) # Index with name pidx1 = pd.Index(range(10000), name="a") pidx2 = pd.Index(range(10000), name="b") - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) - self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) # Index from DataFrame pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}, index=["x", "y", "z"]) - kdf1 = ps.from_pandas(pdf1) - kdf2 = ps.from_pandas(pdf2) + psdf1 = ps.from_pandas(pdf1) + psdf2 = ps.from_pandas(pdf2) pidx1 = pdf1.set_index("a").index pidx2 = pdf2.set_index("d").index - kidx1 = kdf1.set_index("a").index - kidx2 = kdf2.set_index("d").index + psidx1 = psdf1.set_index("a").index + psidx2 = psdf2.set_index("d").index - self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) - self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) # Index from DataFrame with MultiIndex columns pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}) pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")]) - kdf1 = ps.from_pandas(pdf1) - kdf2 = ps.from_pandas(pdf2) + psdf1 = ps.from_pandas(pdf1) + psdf2 = ps.from_pandas(pdf2) pidx1 = pdf1.set_index(("a", "x")).index pidx2 = pdf2.set_index(("d", "y")).index - kidx1 = kdf1.set_index(("a", "x")).index - kidx2 = kdf2.set_index(("d", "y")).index + psidx1 = psdf1.set_index(("a", "x")).index + psidx2 = psdf2.set_index(("d", "y")).index - self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) + self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) - self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) + self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.append(pmidx), kmidx.append(kmidx)) + self.assert_eq(pmidx.append(pmidx), psmidx.append(psmidx)) # MultiIndex with names pmidx1 = pd.MultiIndex.from_tuples( @@ -1030,83 +1036,83 @@ def test_append(self): pmidx2 = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"] ) - kmidx1 = ps.from_pandas(pmidx1) - kmidx2 = ps.from_pandas(pmidx2) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.append(pmidx2), kmidx1.append(kmidx2)) + self.assert_eq(pmidx1.append(pmidx2), 
psmidx1.append(psmidx2)) - self.assert_eq(pmidx2.append(pmidx1), kmidx2.append(kmidx1)) + self.assert_eq(pmidx2.append(pmidx1), psmidx2.append(psmidx1)) - self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names) - self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names) # Index & MultiIndex currently is not supported expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported" with self.assertRaisesRegex(NotImplementedError, expected_error_message): - kidx.append(kmidx) + psidx.append(psmidx) with self.assertRaisesRegex(NotImplementedError, expected_error_message): - kmidx.append(kidx) + psmidx.append(psidx) def test_argmin(self): pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.argmin(), kidx.argmin()) + self.assert_eq(pidx.argmin(), psidx.argmin()) # MultiIndex - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( TypeError, "reduction operation 'argmin' not allowed for this dtype" ): - kidx.argmin() + psidx.argmin() def test_argmax(self): pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.argmax(), kidx.argmax()) + self.assert_eq(pidx.argmax(), psidx.argmax()) # MultiIndex - kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( TypeError, "reduction operation 'argmax' not allowed for this dtype" ): - kidx.argmax() + psidx.argmax() def test_min(self): pidx = pd.Index([3, 2, 1]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.min(), kidx.min()) + self.assert_eq(pidx.min(), psidx.min()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.min(), kmidx.min()) + self.assert_eq(pmidx.min(), psmidx.min()) pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.min(), kidx.min()) + self.assert_eq(pidx.min(), psidx.min()) def test_max(self): pidx = pd.Index([3, 2, 1]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.max(), kidx.max()) + self.assert_eq(pidx.max(), psidx.max()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.max(), kmidx.max()) + self.assert_eq(pmidx.max(), psmidx.max()) pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.max(), kidx.max()) + self.assert_eq(pidx.max(), psidx.max()) def test_monotonic(self): # test monotonic_increasing & monotonic_decreasing for MultiIndex. 
@@ -1174,9 +1180,9 @@ def test_monotonic(self): for data in datas: with self.subTest(data=data): pmidx = pd.MultiIndex.from_tuples(data) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) # datas below return different result depends on pandas version. # Because the behavior of handling null values is changed in pandas >= 1.1.4. @@ -1209,13 +1215,13 @@ def test_monotonic(self): for data in datas: with self.subTest(data=data): pmidx = pd.MultiIndex.from_tuples(data) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) if LooseVersion(pd.__version__) < LooseVersion("1.1.4"): - self.assert_eq(kmidx.is_monotonic_increasing, False) - self.assert_eq(kmidx.is_monotonic_decreasing, False) + self.assert_eq(psmidx.is_monotonic_increasing, False) + self.assert_eq(psmidx.is_monotonic_decreasing, False) else: - self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) # The datas below are tested another way since they cannot be an arguments for # `MultiIndex.from_tuples` in pandas >= 1.1.0. @@ -1224,104 +1230,105 @@ def test_monotonic(self): pmidx = pd.MultiIndex.from_tuples( [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] ) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.is_monotonic_increasing, False) - self.assert_eq(kmidx.is_monotonic_decreasing, False) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(psmidx.is_monotonic_increasing, False) + self.assert_eq(psmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples( [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] ) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.is_monotonic_increasing, False) - self.assert_eq(kmidx.is_monotonic_decreasing, False) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(psmidx.is_monotonic_increasing, False) + self.assert_eq(psmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples( [(None, None), (None, None), (None, None), (None, None), (None, None)] ) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.is_monotonic_increasing, False) - self.assert_eq(kmidx.is_monotonic_decreasing, False) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(psmidx.is_monotonic_increasing, False) + self.assert_eq(psmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples([(None, None)]) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.is_monotonic_increasing, False) - self.assert_eq(kmidx.is_monotonic_decreasing, False) - - # Disable the test cases below because pandas returns `True` or `False` randomly. 
- # else: - # [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] - # kdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]}) - # kdf["b"] = None - # kmidx = kdf.set_index(["a", "b"]).index - # pmidx = kmidx.to_pandas() - # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - - # [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] - # kdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]}) - # kdf["a"] = None - # kmidx = kdf.set_index(["a", "b"]).index - # pmidx = kmidx.to_pandas() - # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - - # [(None, None), (None, None), (None, None), (None, None), (None, None)] - # kdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]}) - # kdf["a"] = None - # kdf["b"] = None - # kmidx = kdf.set_index(["a", "b"]).index - # pmidx = kmidx.to_pandas() - # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - # [(None, None)] - # kdf = ps.DataFrame({"a": [1], "b": [1]}) - # kdf["a"] = None - # kdf["b"] = None - # kmidx = kdf.set_index(["a", "b"]).index - # pmidx = kmidx.to_pandas() - # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(psmidx.is_monotonic_increasing, False) + self.assert_eq(psmidx.is_monotonic_decreasing, False) + + else: + [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] + psdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]}) + psdf["b"] = None + psmidx = psdf.set_index(["a", "b"]).index + pmidx = psmidx.to_pandas() + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + + [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] + psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]}) + psdf["a"] = None + psmidx = psdf.set_index(["a", "b"]).index + pmidx = psmidx.to_pandas() + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + + [(None, None), (None, None), (None, None), (None, None), (None, None)] + psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]}) + psdf["a"] = None + psdf["b"] = None + psmidx = psdf.set_index(["a", "b"]).index + pmidx = psmidx.to_pandas() + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + [(None, None)] + psdf = ps.DataFrame({"a": [1], "b": [1]}) + psdf["a"] = None + psdf["b"] = None + psmidx = psdf.set_index(["a", "b"]).index + pmidx = psmidx.to_pandas() + self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) def test_difference(self): # Index pidx1 = pd.Index([1, 2, 3, 4], name="koalas") pidx2 = pd.Index([3, 4, 5, 6], name="koalas") - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - 
self.assert_eq(kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values()) self.assert_eq( - kidx1.difference([3, 4, 5, 6]).sort_values(), + psidx1.difference(psidx2).sort_values(), pidx1.difference(pidx2).sort_values() + ) + self.assert_eq( + psidx1.difference([3, 4, 5, 6]).sort_values(), pidx1.difference([3, 4, 5, 6]).sort_values(), ) self.assert_eq( - kidx1.difference((3, 4, 5, 6)).sort_values(), + psidx1.difference((3, 4, 5, 6)).sort_values(), pidx1.difference((3, 4, 5, 6)).sort_values(), ) self.assert_eq( - kidx1.difference({3, 4, 5, 6}).sort_values(), + psidx1.difference({3, 4, 5, 6}).sort_values(), pidx1.difference({3, 4, 5, 6}).sort_values(), ) self.assert_eq( - kidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), + psidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), pidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), ) # Exceptions for Index with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx1.difference("1234") + psidx1.difference("1234") with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx1.difference(1234) + psidx1.difference(1234) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx1.difference(12.34) + psidx1.difference(12.34) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx1.difference(None) + psidx1.difference(None) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx1.difference(np.nan) + psidx1.difference(np.nan) with self.assertRaisesRegex( ValueError, "The 'sort' keyword only takes the values of None or True; 1 was passed." ): - kidx1.difference(kidx2, sort=1) + psidx1.difference(psidx2, sort=1) # MultiIndex pidx1 = pd.MultiIndex.from_tuples( @@ -1330,148 +1337,150 @@ def test_difference(self): pidx2 = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "z", 2), ("k", "z", 3)], names=["hello", "koalas", "world"] ) - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - self.assert_eq(kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values()) self.assert_eq( - kidx1.difference({("a", "x", 1)}).sort_values(), + psidx1.difference(psidx2).sort_values(), pidx1.difference(pidx2).sort_values() + ) + self.assert_eq( + psidx1.difference({("a", "x", 1)}).sort_values(), pidx1.difference({("a", "x", 1)}).sort_values(), ) self.assert_eq( - kidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), + psidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), pidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), ) # Exceptions for MultiIndex with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - kidx1.difference(["b", "z", "2"]) + psidx1.difference(["b", "z", "2"]) def test_repeat(self): pidx = pd.Index(["a", "b", "c"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) - self.assert_eq(kidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) - self.assert_eq((kidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) + self.assert_eq(psidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) + self.assert_eq(psidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) + self.assert_eq((psidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) - self.assertRaises(ValueError, lambda: kidx.repeat(-1)) - self.assertRaises(ValueError, 
lambda: kidx.repeat("abc")) + self.assertRaises(ValueError, lambda: psidx.repeat(-1)) + self.assertRaises(TypeError, lambda: psidx.repeat("abc")) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) - self.assert_eq(kmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) + self.assert_eq(psmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) + self.assert_eq(psmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) - self.assertRaises(ValueError, lambda: kmidx.repeat(-1)) - self.assertRaises(ValueError, lambda: kmidx.repeat("abc")) + self.assertRaises(ValueError, lambda: psmidx.repeat(-1)) + self.assertRaises(TypeError, lambda: psmidx.repeat("abc")) def test_unique(self): pidx = pd.Index(["a", "b", "a"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.unique().sort_values(), pidx.unique().sort_values()) - self.assert_eq(kidx.unique().sort_values(), pidx.unique().sort_values()) + self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values()) + self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values()) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values()) - self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values()) + self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) + self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) def test_asof(self): # Increasing values pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(repr(kidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) - self.assert_eq(kidx.asof("2014-01-04"), pidx.asof("2014-01-04")) + self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) + self.assert_eq(psidx.asof("2014-01-04"), pidx.asof("2014-01-04")) pidx = pd.DatetimeIndex(["2013-12-31", "2014-01-02", "2014-01-03"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(repr(kidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) + self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) # Decreasing values pidx = pd.Index(["2014-01-03", "2014-01-02", "2013-12-31"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(kidx.asof("1999-01-02"), pidx.asof("1999-01-02")) - self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) + self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(psidx.asof("2014-01-02"), 
pidx.asof("2014-01-02")) + self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02")) + self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) pidx = pd.DatetimeIndex(["2014-01-03", "2014-01-02", "2013-12-31"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) # TODO: a pandas bug? - # self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - # self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - # self.assert_eq(kidx.asof("1999-01-02"), pidx.asof("1999-01-02")) - # self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) - self.assert_eq(kidx.asof("2014-01-01"), pd.Timestamp("2014-01-02 00:00:00")) - self.assert_eq(kidx.asof("2014-01-02"), pd.Timestamp("2014-01-02 00:00:00")) - self.assert_eq(kidx.asof("1999-01-02"), pd.Timestamp("2013-12-31 00:00:00")) - self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pd.NaT)) + # self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + # self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + # self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02")) + # self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) + self.assert_eq(psidx.asof("2014-01-01"), pd.Timestamp("2014-01-02 00:00:00")) + self.assert_eq(psidx.asof("2014-01-02"), pd.Timestamp("2014-01-02 00:00:00")) + self.assert_eq(psidx.asof("1999-01-02"), pd.Timestamp("2013-12-31 00:00:00")) + self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pd.NaT)) # Not increasing, neither decreasing (ValueError) - kidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"]) - self.assertRaises(ValueError, lambda: kidx.asof("2013-12-31")) + psidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"]) + self.assertRaises(ValueError, lambda: psidx.asof("2013-12-31")) - kmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")]) - self.assertRaises(NotImplementedError, lambda: kmidx.asof(("a", "b"))) + psmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")]) + self.assertRaises(NotImplementedError, lambda: psmidx.asof(("a", "b"))) def test_union(self): # Index pidx1 = pd.Index([1, 2, 3, 4]) pidx2 = pd.Index([3, 4, 5, 6]) - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - self.assert_eq(kidx1.union(kidx2), pidx1.union(pidx2)) - self.assert_eq(kidx2.union(kidx1), pidx2.union(pidx1)) - self.assert_eq(kidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True) - self.assert_eq(kidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True) + self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) + self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) + self.assert_eq(psidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True) + self.assert_eq(psidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True) self.assert_eq( - kidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True + psidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True ) self.assert_eq( - kidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True + psidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True ) # Testing if the result is correct after sort=False. # The `sort` argument is added in pandas 0.24. 
if LooseVersion(pd.__version__) >= LooseVersion("0.24"): self.assert_eq( - kidx1.union(kidx2, sort=False).sort_values(), + psidx1.union(psidx2, sort=False).sort_values(), pidx1.union(pidx2, sort=False).sort_values(), ) self.assert_eq( - kidx2.union(kidx1, sort=False).sort_values(), + psidx2.union(psidx1, sort=False).sort_values(), pidx2.union(pidx1, sort=False).sort_values(), ) self.assert_eq( - kidx1.union([3, 4, 5, 6], sort=False).sort_values(), + psidx1.union([3, 4, 5, 6], sort=False).sort_values(), pidx1.union([3, 4, 5, 6], sort=False).sort_values(), almost=True, ) self.assert_eq( - kidx2.union([1, 2, 3, 4], sort=False).sort_values(), + psidx2.union([1, 2, 3, 4], sort=False).sort_values(), pidx2.union([1, 2, 3, 4], sort=False).sort_values(), almost=True, ) self.assert_eq( - kidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), + psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(), almost=True, ) self.assert_eq( - kidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), + psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(), almost=True, ) @@ -1480,26 +1489,26 @@ def test_union(self): if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4]) pidx2 = pd.Index([3, 4, 3, 4, 5, 6]) - kidx1 = ps.from_pandas(pidx1) - kidx2 = ps.from_pandas(pidx2) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - self.assert_eq(kidx1.union(kidx2), pidx1.union(pidx2)) - self.assert_eq(kidx2.union(kidx1), pidx2.union(pidx1)) + self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) + self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) self.assert_eq( - kidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True + psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True ) self.assert_eq( - kidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), + psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), almost=True, ) self.assert_eq( - kidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), + psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])), almost=True, ) self.assert_eq( - kidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), + psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])), almost=True, ) @@ -1509,29 +1518,29 @@ def test_union(self): pmidx2 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]) pmidx3 = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]) pmidx4 = pd.MultiIndex.from_tuples([(1, 3), (1, 4), (1, 5), (1, 6)]) - kmidx1 = ps.from_pandas(pmidx1) - kmidx2 = ps.from_pandas(pmidx2) - kmidx3 = ps.from_pandas(pmidx3) - kmidx4 = ps.from_pandas(pmidx4) - - self.assert_eq(kmidx1.union(kmidx2), pmidx1.union(pmidx2)) - self.assert_eq(kmidx2.union(kmidx1), pmidx2.union(pmidx1)) - self.assert_eq(kmidx3.union(kmidx4), pmidx3.union(pmidx4)) - self.assert_eq(kmidx4.union(kmidx3), pmidx4.union(pmidx3)) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + psmidx3 = ps.from_pandas(pmidx3) + psmidx4 = ps.from_pandas(pmidx4) + + self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2)) + self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1)) + self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4)) + self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3)) self.assert_eq( - 
kmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), + psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), ) self.assert_eq( - kmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), + psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), ) self.assert_eq( - kmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), + psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), ) self.assert_eq( - kmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), + psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), ) @@ -1539,23 +1548,23 @@ def test_union(self): # The `sort` argument is added in pandas 0.24. if LooseVersion(pd.__version__) >= LooseVersion("0.24"): self.assert_eq( - kmidx1.union(kmidx2, sort=False).sort_values(), + psmidx1.union(psmidx2, sort=False).sort_values(), pmidx1.union(pmidx2, sort=False).sort_values(), ) self.assert_eq( - kmidx2.union(kmidx1, sort=False).sort_values(), + psmidx2.union(psmidx1, sort=False).sort_values(), pmidx2.union(pmidx1, sort=False).sort_values(), ) self.assert_eq( - kmidx3.union(kmidx4, sort=False).sort_values(), + psmidx3.union(psmidx4, sort=False).sort_values(), pmidx3.union(pmidx4, sort=False).sort_values(), ) self.assert_eq( - kmidx4.union(kmidx3, sort=False).sort_values(), + psmidx4.union(psmidx3, sort=False).sort_values(), pmidx4.union(pmidx3, sort=False).sort_values(), ) self.assert_eq( - kmidx1.union( + psmidx1.union( [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False ).sort_values(), pmidx1.union( @@ -1563,7 +1572,7 @@ def test_union(self): ).sort_values(), ) self.assert_eq( - kmidx2.union( + psmidx2.union( [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False ).sort_values(), pmidx2.union( @@ -1571,11 +1580,11 @@ def test_union(self): ).sort_values(), ) self.assert_eq( - kmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), + psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), ) self.assert_eq( - kmidx4.union( + psmidx4.union( [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False ).sort_values(), pmidx4.union( @@ -1583,330 +1592,331 @@ def test_union(self): ).sort_values(), ) - self.assertRaises(NotImplementedError, lambda: kidx1.union(kmidx1)) - self.assertRaises(TypeError, lambda: kmidx1.union(kidx1)) - self.assertRaises(TypeError, lambda: kmidx1.union(["x", "a"])) - self.assertRaises(ValueError, lambda: kidx1.union(ps.range(2))) + self.assertRaises(NotImplementedError, lambda: psidx1.union(psmidx1)) + self.assertRaises(TypeError, lambda: psmidx1.union(psidx1)) + self.assertRaises(TypeError, lambda: psmidx1.union(["x", "a"])) + self.assertRaises(ValueError, lambda: psidx1.union(ps.range(2))) def test_take(self): # Index pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values()) + self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values()) self.assert_eq( - kidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values() + psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values() ) - self.assert_eq(kidx.take([-4, -2, 
0]).sort_values(), pidx.take([-4, -2, 0]).sort_values()) + self.assert_eq(psidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values()) self.assert_eq( - kidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values() + psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values() ) # MultiIndex pmidx = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"] ) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(kmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values()) + self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values()) self.assert_eq( - kmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values() + psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values() ) - self.assert_eq(kmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values()) + self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values()) self.assert_eq( - kmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values() + psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values() ) # Checking the type of indices. - self.assertRaises(ValueError, lambda: kidx.take(1)) - self.assertRaises(ValueError, lambda: kidx.take("1")) - self.assertRaises(ValueError, lambda: kidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kidx.take({1: None, 2: None})) - self.assertRaises(ValueError, lambda: kmidx.take(1)) - self.assertRaises(ValueError, lambda: kmidx.take("1")) - self.assertRaises(ValueError, lambda: kmidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kmidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: psidx.take(1)) + self.assertRaises(TypeError, lambda: psidx.take("1")) + self.assertRaises(TypeError, lambda: psidx.take({1, 2})) + self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: psmidx.take(1)) + self.assertRaises(TypeError, lambda: psmidx.take("1")) + self.assertRaises(TypeError, lambda: psmidx.take({1, 2})) + self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None})) def test_index_get_level_values(self): pidx = pd.Index([1, 2, 3], name="ks") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) for level in [0, "ks"]: - self.assert_eq(kidx.get_level_values(level), pidx.get_level_values(level)) + self.assert_eq(psidx.get_level_values(level), pidx.get_level_values(level)) def test_multiindex_get_level_values(self): pmidx = pd.MultiIndex.from_tuples([("a", "d"), ("b", "e"), ("c", "f")]) pmidx.names = ["level_1", "level_2"] - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) for level in [0, 1, "level_1", "level_2"]: - self.assert_eq(kmidx.get_level_values(level), pmidx.get_level_values(level)) + self.assert_eq(psmidx.get_level_values(level), pmidx.get_level_values(level)) def test_index_get_level_number(self): # name of two levels are the same, which is None - kdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) + psdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) with self.assertRaisesRegex( ValueError, "The name None occurs multiple times, use a level number" ): - kdf.index._get_level_number(None) + psdf.index._get_level_number(None) mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) mi.names = ["level_1", "level_2"] - kdf = ps.DataFrame({"a": [1, 2, 3]}, index=mi) + psdf = 
ps.DataFrame({"a": [1, 2, 3]}, index=mi) # level is not int and not in the level name list with self.assertRaisesRegex(KeyError, "Level lv_3 not found"): - kdf.index._get_level_number("lv_3") + psdf.index._get_level_number("lv_3") # level is int, but an invalid negative number with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - kdf.index._get_level_number(-3) + psdf.index._get_level_number(-3) # level is int, but an invalid positive number with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - kdf.index._get_level_number(3) + psdf.index._get_level_number(3) # Correct and valid inputs in numbers level_number = [-2, -1, 0, 1] outputs = [0, 1, 0, 1] for lv, output in zip(level_number, outputs): - self.assertEqual(output, kdf.index._get_level_number(lv)) + self.assertEqual(output, psdf.index._get_level_number(lv)) # Valid inputs as level names level_names = ["level_1", "level_2"] outputs = [0, 1] for lv, output in zip(level_names, outputs): - self.assertEqual(output, kdf.index._get_level_number(lv)) + self.assertEqual(output, psdf.index._get_level_number(lv)) def test_holds_integer(self): pidx = pd.Index([1, 2, 3, 4]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) pidx = pd.Index([1.1, 2.2, 3.3, 4.4]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) pidx = pd.Index(["A", "B", "C", "D"]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")]) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.holds_integer(), kmidx.holds_integer()) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer()) pmidx = pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)]) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.holds_integer(), kmidx.holds_integer()) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer()) def test_abs(self): pidx = pd.Index([-2, -1, 0, 1]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(abs(pidx), abs(kidx)) - self.assert_eq(np.abs(pidx), np.abs(kidx)) + self.assert_eq(abs(pidx), abs(psidx)) + self.assert_eq(np.abs(pidx), np.abs(psidx)) - kidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) + psidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): - abs(kidx) + abs(psidx) def test_hasnans(self): # BooleanType pidx = pd.Index([True, False, True, True]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, kidx.hasnans) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, psidx.hasnans) pidx = pd.Index([True, False, np.nan, True]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, kidx.hasnans) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, psidx.hasnans) # TimestampType pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) pser = 
pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) # Not supported for MultiIndex - kmidx = ps.Index([("a", 1), ("b", 2)]) - self.assertRaises(NotImplementedError, lambda: kmidx.hasnans()) + psmidx = ps.Index([("a", 1), ("b", 2)]) + self.assertRaises(NotImplementedError, lambda: psmidx.hasnans()) def test_intersection(self): pidx = pd.Index([1, 2, 3, 4], name="Koalas") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) # other = Index pidx_other = pd.Index([3, 4, 5, 6], name="Koalas") - kidx_other = ps.from_pandas(pidx_other) - self.assert_eq(pidx.intersection(pidx_other), kidx.intersection(kidx_other).sort_values()) + psidx_other = ps.from_pandas(pidx_other) + self.assert_eq(pidx.intersection(pidx_other), psidx.intersection(psidx_other).sort_values()) self.assert_eq( - (pidx + 1).intersection(pidx_other), (kidx + 1).intersection(kidx_other).sort_values() + (pidx + 1).intersection(pidx_other), (psidx + 1).intersection(psidx_other).sort_values() ) pidx_other_different_name = pd.Index([3, 4, 5, 6], name="Databricks") - kidx_other_different_name = ps.from_pandas(pidx_other_different_name) + psidx_other_different_name = ps.from_pandas(pidx_other_different_name) self.assert_eq( pidx.intersection(pidx_other_different_name), - kidx.intersection(kidx_other_different_name).sort_values(), + psidx.intersection(psidx_other_different_name).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pidx_other_different_name), - (kidx + 1).intersection(kidx_other_different_name).sort_values(), + (psidx + 1).intersection(psidx_other_different_name).sort_values(), ) pidx_other_from_frame = pd.DataFrame({"a": [3, 4, 5, 6]}).set_index("a").index - kidx_other_from_frame = ps.from_pandas(pidx_other_from_frame) + psidx_other_from_frame = ps.from_pandas(pidx_other_from_frame) self.assert_eq( pidx.intersection(pidx_other_from_frame), - kidx.intersection(kidx_other_from_frame).sort_values(), + psidx.intersection(psidx_other_from_frame).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pidx_other_from_frame), - (kidx + 1).intersection(kidx_other_from_frame).sort_values(), + (psidx + 1).intersection(psidx_other_from_frame).sort_values(), ) # other = MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - kidx.intersection(kmidx).sort_values(), - kidx._kdf.head(0).index.rename(None), + psidx.intersection(psmidx).sort_values(), + psidx._psdf.head(0).index.rename(None), almost=True, ) self.assert_eq( - (kidx + 1).intersection(kmidx).sort_values(), - kidx._kdf.head(0).index.rename(None), + (psidx + 1).intersection(psmidx).sort_values(), + psidx._psdf.head(0).index.rename(None), almost=True, ) else: self.assert_eq( - pidx.intersection(pmidx), kidx.intersection(kmidx).sort_values(), almost=True + pidx.intersection(pmidx), psidx.intersection(psmidx).sort_values(), almost=True ) self.assert_eq( (pidx + 1).intersection(pmidx), - (kidx + 1).intersection(kmidx).sort_values(), + (psidx + 1).intersection(psmidx).sort_values(), almost=True, ) # other = Series pser = pd.Series([3, 4, 5, 6]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): - self.assert_eq(kidx.intersection(kser).sort_values(), ps.Index([3, 4], 
name="Koalas")) + self.assert_eq(psidx.intersection(psser).sort_values(), ps.Index([3, 4], name="Koalas")) self.assert_eq( - (kidx + 1).intersection(kser).sort_values(), ps.Index([3, 4, 5], name="Koalas") + (psidx + 1).intersection(psser).sort_values(), ps.Index([3, 4, 5], name="Koalas") ) else: - self.assert_eq(pidx.intersection(pser), kidx.intersection(kser).sort_values()) + self.assert_eq(pidx.intersection(pser), psidx.intersection(psser).sort_values()) self.assert_eq( - (pidx + 1).intersection(pser), (kidx + 1).intersection(kser).sort_values() + (pidx + 1).intersection(pser), (psidx + 1).intersection(psser).sort_values() ) pser_different_name = pd.Series([3, 4, 5, 6], name="Databricks") - kser_different_name = ps.from_pandas(pser_different_name) + psser_different_name = ps.from_pandas(pser_different_name) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - kidx.intersection(kser_different_name).sort_values(), + psidx.intersection(psser_different_name).sort_values(), ps.Index([3, 4], name="Koalas"), ) self.assert_eq( - (kidx + 1).intersection(kser_different_name).sort_values(), + (psidx + 1).intersection(psser_different_name).sort_values(), ps.Index([3, 4, 5], name="Koalas"), ) else: self.assert_eq( pidx.intersection(pser_different_name), - kidx.intersection(kser_different_name).sort_values(), + psidx.intersection(psser_different_name).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pser_different_name), - (kidx + 1).intersection(kser_different_name).sort_values(), + (psidx + 1).intersection(psser_different_name).sort_values(), ) others = ([3, 4, 5, 6], (3, 4, 5, 6), {3: None, 4: None, 5: None, 6: None}) for other in others: if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - kidx.intersection(other).sort_values(), ps.Index([3, 4], name="Koalas") + psidx.intersection(other).sort_values(), ps.Index([3, 4], name="Koalas") ) self.assert_eq( - (kidx + 1).intersection(other).sort_values(), ps.Index([3, 4, 5], name="Koalas") + (psidx + 1).intersection(other).sort_values(), + ps.Index([3, 4, 5], name="Koalas"), ) else: - self.assert_eq(pidx.intersection(other), kidx.intersection(other).sort_values()) + self.assert_eq(pidx.intersection(other), psidx.intersection(other).sort_values()) self.assert_eq( - (pidx + 1).intersection(other), (kidx + 1).intersection(other).sort_values() + (pidx + 1).intersection(other), (psidx + 1).intersection(other).sort_values() ) # MultiIndex / other = Index self.assert_eq( - pmidx.intersection(pidx), kmidx.intersection(kidx).sort_values(), almost=True + pmidx.intersection(pidx), psmidx.intersection(psidx).sort_values(), almost=True ) self.assert_eq( pmidx.intersection(pidx_other_from_frame), - kmidx.intersection(kidx_other_from_frame).sort_values(), + psmidx.intersection(psidx_other_from_frame).sort_values(), almost=True, ) # MultiIndex / other = MultiIndex pmidx_other = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")]) - kmidx_other = ps.from_pandas(pmidx_other) + psmidx_other = ps.from_pandas(pmidx_other) self.assert_eq( - pmidx.intersection(pmidx_other), kmidx.intersection(kmidx_other).sort_values() + pmidx.intersection(pmidx_other), psmidx.intersection(psmidx_other).sort_values() ) # MultiIndex / other = list other = [("c", "z"), ("d", "w")] - self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) # MultiIndex / other = tuple other = (("c", "z"), ("d", "w")) - 
self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) # MultiIndex / other = dict other = {("c", "z"): None, ("d", "w"): None} - self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - kidx.intersection(4) + psidx.intersection(4) with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - kmidx.intersection(4) + psmidx.intersection(4) with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - kmidx.intersection(ps.Series([3, 4, 5, 6])) + psmidx.intersection(ps.Series([3, 4, 5, 6])) with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): - kidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) + psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): - kmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) + psmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) def test_item(self): pidx = pd.Index([10]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.item(), kidx.item()) + self.assert_eq(pidx.item(), psidx.item()) # with timestamp pidx = pd.Index([datetime(1990, 3, 9)]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.item(), kidx.item()) + self.assert_eq(pidx.item(), psidx.item()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.item(), kmidx.item()) + self.assert_eq(pmidx.item(), psmidx.item()) # MultiIndex with timestamp pmidx = pd.MultiIndex.from_tuples([(datetime(1990, 3, 9), datetime(2019, 8, 15))]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pidx.item(), kidx.item()) + self.assert_eq(pidx.item(), psidx.item()) err_msg = "can only convert an array of size 1 to a Python scalar" with self.assertRaisesRegex(ValueError, err_msg): @@ -1917,43 +1927,43 @@ def test_item(self): def test_inferred_type(self): # Integer pidx = pd.Index([1, 2, 3]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, kidx.inferred_type) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, psidx.inferred_type) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, kidx.inferred_type) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, psidx.inferred_type) # String pidx = pd.Index(["a", "b", "c"]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, kidx.inferred_type) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, psidx.inferred_type) # Boolean pidx = pd.Index([True, False, True, False]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, kidx.inferred_type) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, psidx.inferred_type) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.inferred_type, kmidx.inferred_type) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.inferred_type, psmidx.inferred_type) def test_multi_index_from_index(self): tuples = [(1, "red"), (1, 
"blue"), (2, "red"), (2, "blue")] pmidx = pd.Index(tuples) - kmidx = ps.Index(tuples) + psmidx = ps.Index(tuples) - self.assertTrue(isinstance(kmidx, ps.MultiIndex)) - self.assert_eq(pmidx, kmidx) + self.assertTrue(isinstance(psmidx, ps.MultiIndex)) + self.assert_eq(pmidx, psmidx) # Specify the `names` pmidx = pd.Index(tuples, names=["Hello", "Koalas"]) - kmidx = ps.Index(tuples, names=["Hello", "Koalas"]) + psmidx = ps.Index(tuples, names=["Hello", "Koalas"]) - self.assertTrue(isinstance(kmidx, ps.MultiIndex)) - self.assert_eq(pmidx, kmidx) + self.assertTrue(isinstance(psmidx, ps.MultiIndex)) + self.assert_eq(pmidx, psmidx) @unittest.skipIf( LooseVersion(pd.__version__) < LooseVersion("0.24"), @@ -1963,126 +1973,129 @@ def test_multiindex_from_frame(self): pdf = pd.DataFrame( [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"] ) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pidx = pd.MultiIndex.from_frame(pdf) - kidx = ps.MultiIndex.from_frame(kdf) + psidx = ps.MultiIndex.from_frame(psdf) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) # Specify `names` pidx = pd.MultiIndex.from_frame(pdf, names=["state", "observation"]) - kidx = ps.MultiIndex.from_frame(kdf, names=["state", "observation"]) - self.assert_eq(pidx, kidx) + psidx = ps.MultiIndex.from_frame(psdf, names=["state", "observation"]) + self.assert_eq(pidx, psidx) pidx = pd.MultiIndex.from_frame(pdf, names=("state", "observation")) - kidx = ps.MultiIndex.from_frame(kdf, names=("state", "observation")) - self.assert_eq(pidx, kidx) + psidx = ps.MultiIndex.from_frame(psdf, names=("state", "observation")) + self.assert_eq(pidx, psidx) # MultiIndex columns pidx = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")]) pdf.columns = pidx - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pidx = pd.MultiIndex.from_frame(pdf) - kidx = ps.MultiIndex.from_frame(kdf) + psidx = ps.MultiIndex.from_frame(psdf) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) # tuples for names pidx = pd.MultiIndex.from_frame(pdf, names=[("a", "w"), ("b", "x")]) - kidx = ps.MultiIndex.from_frame(kdf, names=[("a", "w"), ("b", "x")]) + psidx = ps.MultiIndex.from_frame(psdf, names=[("a", "w"), ("b", "x")]) - self.assert_eq(pidx, kidx) + self.assert_eq(pidx, psidx) err_msg = "Input must be a DataFrame" with self.assertRaisesRegex(TypeError, err_msg): ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) - self.assertRaises(ValueError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) + self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(psdf, names="ab")) # non-string names self.assert_eq( - ps.MultiIndex.from_frame(kdf, names=[0, 1]), pd.MultiIndex.from_frame(pdf, names=[0, 1]) + ps.MultiIndex.from_frame(psdf, names=[0, 1]), + pd.MultiIndex.from_frame(pdf, names=[0, 1]), ) self.assert_eq( - ps.MultiIndex.from_frame(kdf, names=[("x", 0), ("y", 1)]), + ps.MultiIndex.from_frame(psdf, names=[("x", 0), ("y", 1)]), pd.MultiIndex.from_frame(pdf, names=[("x", 0), ("y", 1)]), ) pdf = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]]) - kdf = ps.from_pandas(pdf) - self.assert_eq(ps.MultiIndex.from_frame(kdf), pd.MultiIndex.from_frame(pdf)) + psdf = ps.from_pandas(pdf) + self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf)) def test_is_type_compatible(self): data_types = ["integer", "floating", "string", "boolean"] # Integer pidx = pd.Index([1, 2, 3]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) for data_type in 
data_types: - self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) # String pidx = pd.Index(["a", "b", "c"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) # Boolean pidx = pd.Index([True, False, True, False]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) for data_type in data_types: - self.assert_eq(pmidx.is_type_compatible(data_type), kmidx.is_type_compatible(data_type)) + self.assert_eq( + pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type) + ) def test_asi8(self): # Integer pidx = pd.Index([1, 2, 3]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) - self.assert_eq(pidx.astype("int").asi8, kidx.astype("int").asi8) - self.assert_eq(pidx.astype("int16").asi8, kidx.astype("int16").asi8) - self.assert_eq(pidx.astype("int8").asi8, kidx.astype("int8").asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) + self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8) + self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8) + self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8) # Integer with missing value pidx = pd.Index([1, 2, None, 4, 5]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) # Datetime pidx = pd.date_range(end="1/1/2018", periods=3) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) # String pidx = pd.Index(["a", "b", "c"]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) # Boolean pidx = pd.Index([True, False, True, False]) - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, kidx.asi8) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, psidx.asi8) # MultiIndex pmidx = pd.MultiIndex.from_tuples([(1, 2)]) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.asi8, kmidx.asi8) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.asi8, psmidx.asi8) def test_index_is_unique(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] @@ -2091,9 +2104,9 @@ def test_index_is_unique(self): for idx, name, expected in zip(indexes, names, is_uniq): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name)) - kdf = 
ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.is_unique, expected) + self.assertEqual(psdf.index.is_unique, expected) def test_multiindex_is_unique(self): indexes = [ @@ -2106,175 +2119,175 @@ def test_multiindex_is_unique(self): for idx, expected in zip(indexes, is_uniq): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assertEqual(kdf.index.is_unique, expected) + self.assertEqual(psdf.index.is_unique, expected) def test_view(self): pidx = pd.Index([1, 2, 3, 4], name="Koalas") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.view(), kidx.view()) + self.assert_eq(pidx.view(), psidx.view()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.view(), kmidx.view()) + self.assert_eq(pmidx.view(), psmidx.view()) def test_insert(self): # Integer pidx = pd.Index([1, 2, 3], name="Koalas") - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, 100), kidx.insert(1, 100)) - self.assert_eq(pidx.insert(-1, 100), kidx.insert(-1, 100)) - self.assert_eq(pidx.insert(100, 100), kidx.insert(100, 100)) - self.assert_eq(pidx.insert(-100, 100), kidx.insert(-100, 100)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, 100), psidx.insert(1, 100)) + self.assert_eq(pidx.insert(-1, 100), psidx.insert(-1, 100)) + self.assert_eq(pidx.insert(100, 100), psidx.insert(100, 100)) + self.assert_eq(pidx.insert(-100, 100), psidx.insert(-100, 100)) # Floating pidx = pd.Index([1.0, 2.0, 3.0], name="Koalas") - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, 100.0), kidx.insert(1, 100.0)) - self.assert_eq(pidx.insert(-1, 100.0), kidx.insert(-1, 100.0)) - self.assert_eq(pidx.insert(100, 100.0), kidx.insert(100, 100.0)) - self.assert_eq(pidx.insert(-100, 100.0), kidx.insert(-100, 100.0)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, 100.0), psidx.insert(1, 100.0)) + self.assert_eq(pidx.insert(-1, 100.0), psidx.insert(-1, 100.0)) + self.assert_eq(pidx.insert(100, 100.0), psidx.insert(100, 100.0)) + self.assert_eq(pidx.insert(-100, 100.0), psidx.insert(-100, 100.0)) # String pidx = pd.Index(["a", "b", "c"], name="Koalas") - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, "x"), kidx.insert(1, "x")) - self.assert_eq(pidx.insert(-1, "x"), kidx.insert(-1, "x")) - self.assert_eq(pidx.insert(100, "x"), kidx.insert(100, "x")) - self.assert_eq(pidx.insert(-100, "x"), kidx.insert(-100, "x")) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, "x"), psidx.insert(1, "x")) + self.assert_eq(pidx.insert(-1, "x"), psidx.insert(-1, "x")) + self.assert_eq(pidx.insert(100, "x"), psidx.insert(100, "x")) + self.assert_eq(pidx.insert(-100, "x"), psidx.insert(-100, "x")) # Boolean pidx = pd.Index([True, False, True, False], name="Koalas") - kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, True), kidx.insert(1, True)) - self.assert_eq(pidx.insert(-1, True), kidx.insert(-1, True)) - self.assert_eq(pidx.insert(100, True), kidx.insert(100, True)) - self.assert_eq(pidx.insert(-100, True), kidx.insert(-100, True)) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, True), psidx.insert(1, True)) + self.assert_eq(pidx.insert(-1, True), psidx.insert(-1, True)) + self.assert_eq(pidx.insert(100, True), psidx.insert(100, True)) + self.assert_eq(pidx.insert(-100, True), psidx.insert(-100, True)) # MultiIndex pmidx = 
pd.MultiIndex.from_tuples( [("a", "x"), ("b", "y"), ("c", "z")], names=["Hello", "Koalas"] ) - kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.insert(2, ("h", "j")), kmidx.insert(2, ("h", "j"))) - self.assert_eq(pmidx.insert(-1, ("h", "j")), kmidx.insert(-1, ("h", "j"))) + psmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.insert(2, ("h", "j")), psmidx.insert(2, ("h", "j"))) + self.assert_eq(pmidx.insert(-1, ("h", "j")), psmidx.insert(-1, ("h", "j"))) err_msg = "index 4 is out of bounds for axis 0 with size 3" with self.assertRaisesRegex(IndexError, err_msg): - kmidx.insert(4, ("b", "y")) + psmidx.insert(4, ("b", "y")) def test_astype(self): pidx = pd.Index([10, 20, 15, 30, 45], name="x") - kidx = ps.Index(pidx) - - self.assert_eq(kidx.astype(int), pidx.astype(int)) - self.assert_eq(kidx.astype(np.int), pidx.astype(np.int)) - self.assert_eq(kidx.astype(np.int8), pidx.astype(np.int8)) - self.assert_eq(kidx.astype(np.int16), pidx.astype(np.int16)) - self.assert_eq(kidx.astype(np.int32), pidx.astype(np.int32)) - self.assert_eq(kidx.astype(np.int64), pidx.astype(np.int64)) - self.assert_eq(kidx.astype(np.byte), pidx.astype(np.byte)) - self.assert_eq(kidx.astype("int"), pidx.astype("int")) - self.assert_eq(kidx.astype("int8"), pidx.astype("int8")) - self.assert_eq(kidx.astype("int16"), pidx.astype("int16")) - self.assert_eq(kidx.astype("int32"), pidx.astype("int32")) - self.assert_eq(kidx.astype("int64"), pidx.astype("int64")) - self.assert_eq(kidx.astype("b"), pidx.astype("b")) - self.assert_eq(kidx.astype("byte"), pidx.astype("byte")) - self.assert_eq(kidx.astype("i"), pidx.astype("i")) - self.assert_eq(kidx.astype("long"), pidx.astype("long")) - self.assert_eq(kidx.astype("short"), pidx.astype("short")) - self.assert_eq(kidx.astype(np.float), pidx.astype(np.float)) - self.assert_eq(kidx.astype(np.float32), pidx.astype(np.float32)) - self.assert_eq(kidx.astype(np.float64), pidx.astype(np.float64)) - self.assert_eq(kidx.astype("float"), pidx.astype("float")) - self.assert_eq(kidx.astype("float32"), pidx.astype("float32")) - self.assert_eq(kidx.astype("float64"), pidx.astype("float64")) - self.assert_eq(kidx.astype("double"), pidx.astype("double")) - self.assert_eq(kidx.astype("f"), pidx.astype("f")) - self.assert_eq(kidx.astype(bool), pidx.astype(bool)) - self.assert_eq(kidx.astype("bool"), pidx.astype("bool")) - self.assert_eq(kidx.astype("?"), pidx.astype("?")) - self.assert_eq(kidx.astype(np.unicode_), pidx.astype(np.unicode_)) - self.assert_eq(kidx.astype("str"), pidx.astype("str")) - self.assert_eq(kidx.astype("U"), pidx.astype("U")) + psidx = ps.Index(pidx) + + self.assert_eq(psidx.astype(int), pidx.astype(int)) + self.assert_eq(psidx.astype(np.int), pidx.astype(np.int)) + self.assert_eq(psidx.astype(np.int8), pidx.astype(np.int8)) + self.assert_eq(psidx.astype(np.int16), pidx.astype(np.int16)) + self.assert_eq(psidx.astype(np.int32), pidx.astype(np.int32)) + self.assert_eq(psidx.astype(np.int64), pidx.astype(np.int64)) + self.assert_eq(psidx.astype(np.byte), pidx.astype(np.byte)) + self.assert_eq(psidx.astype("int"), pidx.astype("int")) + self.assert_eq(psidx.astype("int8"), pidx.astype("int8")) + self.assert_eq(psidx.astype("int16"), pidx.astype("int16")) + self.assert_eq(psidx.astype("int32"), pidx.astype("int32")) + self.assert_eq(psidx.astype("int64"), pidx.astype("int64")) + self.assert_eq(psidx.astype("b"), pidx.astype("b")) + self.assert_eq(psidx.astype("byte"), pidx.astype("byte")) + self.assert_eq(psidx.astype("i"), pidx.astype("i")) + 
self.assert_eq(psidx.astype("long"), pidx.astype("long")) + self.assert_eq(psidx.astype("short"), pidx.astype("short")) + self.assert_eq(psidx.astype(np.float), pidx.astype(np.float)) + self.assert_eq(psidx.astype(np.float32), pidx.astype(np.float32)) + self.assert_eq(psidx.astype(np.float64), pidx.astype(np.float64)) + self.assert_eq(psidx.astype("float"), pidx.astype("float")) + self.assert_eq(psidx.astype("float32"), pidx.astype("float32")) + self.assert_eq(psidx.astype("float64"), pidx.astype("float64")) + self.assert_eq(psidx.astype("double"), pidx.astype("double")) + self.assert_eq(psidx.astype("f"), pidx.astype("f")) + self.assert_eq(psidx.astype(bool), pidx.astype(bool)) + self.assert_eq(psidx.astype("bool"), pidx.astype("bool")) + self.assert_eq(psidx.astype("?"), pidx.astype("?")) + self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_)) + self.assert_eq(psidx.astype("str"), pidx.astype("str")) + self.assert_eq(psidx.astype("U"), pidx.astype("U")) pidx = pd.Index([10, 20, 15, 30, 45, None], name="x") - kidx = ps.Index(pidx) + psidx = ps.Index(pidx) pidx = pd.Index(["hi", "hi ", " ", " \t", "", None], name="x") - kidx = ps.Index(pidx) + psidx = ps.Index(pidx) - self.assert_eq(kidx.astype(bool), pidx.astype(bool)) - self.assert_eq(kidx.astype(str).to_numpy(), ["hi", "hi ", " ", " \t", "", "None"]) + self.assert_eq(psidx.astype(bool), pidx.astype(bool)) + self.assert_eq(psidx.astype(str).to_numpy(), ["hi", "hi ", " ", " \t", "", "None"]) pidx = pd.Index([True, False, None], name="x") - kidx = ps.Index(pidx) + psidx = ps.Index(pidx) - self.assert_eq(kidx.astype(bool), pidx.astype(bool)) + self.assert_eq(psidx.astype(bool), pidx.astype(bool)) pidx = pd.Index(["2020-10-27"], name="x") - kidx = ps.Index(pidx) + psidx = ps.Index(pidx) - self.assert_eq(kidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]")) + self.assert_eq(psidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]")) with self.assertRaisesRegex(TypeError, "not understood"): - kidx.astype("int63") + psidx.astype("int63") def test_to_list(self): # Index pidx = pd.Index([1, 2, 3, 4, 5]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) # MultiIndex tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] pmidx = pd.MultiIndex.from_tuples(tuples) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assert_eq(kidx.tolist(), pidx.tolist()) - self.assert_eq(kmidx.tolist(), pmidx.tolist()) + self.assert_eq(psidx.tolist(), pidx.tolist()) + self.assert_eq(psmidx.tolist(), pmidx.tolist()) def test_index_ops(self): pidx = pd.Index([1, 2, 3, 4, 5]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx * 100 + kidx * 10 + kidx, pidx * 100 + pidx * 10 + pidx) + self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx) pidx = pd.Index([1, 2, 3, 4, 5], name="a") - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) - self.assert_eq(kidx * 100 + kidx * 10 + kidx, pidx * 100 + pidx * 10 + pidx) + self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx) pdf = pd.DataFrame( index=pd.MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6)], names=["a", "b"]) ) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pidx1 = pdf.index.get_level_values(0) pidx2 = pdf.index.get_level_values(1) - kidx1 = kdf.index.get_level_values(0) - kidx2 = kdf.index.get_level_values(1) + psidx1 = psdf.index.get_level_values(0) + psidx2 = psdf.index.get_level_values(1) if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - 
self.assert_eq(kidx1 * 10 + kidx2, pidx1 * 10 + pidx2) + self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2) else: - self.assert_eq(kidx1 * 10 + kidx2, (pidx1 * 10 + pidx2).rename(None)) + self.assert_eq(psidx1 * 10 + psidx2, (pidx1 * 10 + pidx2).rename(None)) def test_factorize(self): pidx = pd.Index(["a", "b", "a", "b"]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) pcodes, puniques = pidx.factorize(sort=True) - kcodes, kuniques = kidx.factorize() + kcodes, kuniques = psidx.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - kmidx = ps.from_pandas(pmidx) + psmidx = ps.from_pandas(pmidx) - self.assertRaises(PandasNotImplementedError, lambda: kmidx.factorize()) + self.assertRaises(PandasNotImplementedError, lambda: psmidx.factorize()) if __name__ == "__main__": @@ -2282,7 +2295,8 @@ def test_factorize(self): try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index eae26bc4c876b..2618084f3ac92 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -51,33 +51,33 @@ def pser(self): return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") @property - def kser(self): + def psser(self): return ps.from_pandas(self.pser) def test_series_ops(self): pser = self.pser - kser = self.kser + psser = self.psser - self.assert_eq(kser + 1, pser + 1) - self.assert_eq(1 + kser, 1 + pser) - self.assert_eq(kser + 1 + 10 * kser, pser + 1 + 10 * pser) - self.assert_eq(kser + 1 + 10 * kser.index, pser + 1 + 10 * pser.index) - self.assert_eq(kser.index + 1 + 10 * kser, pser.index + 1 + 10 * pser) + self.assert_eq(psser + 1, pser + 1) + self.assert_eq(1 + psser, 1 + pser) + self.assert_eq(psser + 1 + 10 * psser, pser + 1 + 10 * pser) + self.assert_eq(psser + 1 + 10 * psser.index, pser + 1 + 10 * pser.index) + self.assert_eq(psser.index + 1 + 10 * psser, pser.index + 1 + 10 * pser) def test_series_tuple_name(self): pser = self.pser pser.name = ("x", "a") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser, pser) - self.assert_eq(kser.name, pser.name) + self.assert_eq(psser, pser) + self.assert_eq(psser.name, pser.name) pser.name = ("y", "z") - kser.name = ("y", "z") + psser.name = ("y", "z") - self.assert_eq(kser, pser) - self.assert_eq(kser.name, pser.name) + self.assert_eq(psser, pser) + self.assert_eq(psser.name, pser.name) def test_repr_cache_invalidation(self): # If there is any cache, inplace operations should invalidate it. 
@@ -86,12 +86,12 @@ def test_repr_cache_invalidation(self): s.rename("a", inplace=True) self.assertEqual(s.__repr__(), s.rename("a").__repr__()) - def _check_extension(self, kser, pser): + def _check_extension(self, psser, pser): if LooseVersion("1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.2.2"): - self.assert_eq(kser, pser, check_exact=False) - self.assertTrue(isinstance(kser.dtype, extension_dtypes)) + self.assert_eq(psser, pser, check_exact=False) + self.assertTrue(isinstance(psser.dtype, extension_dtypes)) else: - self.assert_eq(kser, pser) + self.assert_eq(psser, pser) @unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are not available") def test_extension_dtypes(self): @@ -101,11 +101,11 @@ def test_extension_dtypes(self): pd.Series([1, 2, None, 4], dtype="Int32"), pd.Series([1, 2, None, 4], dtype="Int64"), ]: - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self._check_extension(kser, pser) - self._check_extension(kser + F.lit(1).cast("byte"), pser + 1) - self._check_extension(kser + kser, pser + pser) + self._check_extension(psser, pser) + self._check_extension(psser + F.lit(1).cast("byte"), pser + 1) + self._check_extension(psser + psser, pser + pser) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -113,17 +113,17 @@ def test_extension_dtypes(self): def test_extension_object_dtypes(self): # string pser = pd.Series(["a", None, "c", "d"], dtype="string") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self._check_extension(kser, pser) + self._check_extension(psser, pser) # boolean pser = pd.Series([True, False, True, None], dtype="boolean") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self._check_extension(kser, pser) - self._check_extension(kser & kser, pser & pser) - self._check_extension(kser | kser, pser | pser) + self._check_extension(psser, pser) + self._check_extension(psser & psser, pser & pser) + self._check_extension(psser | psser, pser | pser) @unittest.skipIf( not extension_float_dtypes_available, "pandas extension float dtypes are not available" @@ -133,11 +133,11 @@ def test_extension_float_dtypes(self): pd.Series([1.0, 2.0, None, 4.0], dtype="Float32"), pd.Series([1.0, 2.0, None, 4.0], dtype="Float64"), ]: - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self._check_extension(kser, pser) - self._check_extension(kser + 1, pser + 1) - self._check_extension(kser + kser, pser + pser) + self._check_extension(psser, pser) + self._check_extension(psser + 1, pser + 1) + self._check_extension(psser + psser, pser + pser) def test_empty_series(self): pser_a = pd.Series([], dtype="i1") @@ -145,8 +145,8 @@ def test_empty_series(self): self.assert_eq(ps.from_pandas(pser_a), pser_a) - kser_b = ps.from_pandas(pser_b) - self.assert_eq(kser_b, pser_b) + psser_b = ps.from_pandas(pser_b) + self.assert_eq(psser_b, pser_b) with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): self.assert_eq(ps.from_pandas(pser_a), pser_a) @@ -158,165 +158,165 @@ def test_all_null_series(self): self.assert_eq(ps.from_pandas(pser_a), pser_a) - kser_b = ps.from_pandas(pser_b) - self.assert_eq(kser_b, pser_b) + psser_b = ps.from_pandas(pser_b) + self.assert_eq(psser_b, pser_b) with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): self.assert_eq(ps.from_pandas(pser_a), pser_a) self.assert_eq(ps.from_pandas(pser_b), pser_b) def test_head(self): - kser = self.kser + psser = self.psser pser = self.pser - self.assert_eq(kser.head(3), pser.head(3)) - 
self.assert_eq(kser.head(0), pser.head(0)) - self.assert_eq(kser.head(-3), pser.head(-3)) - self.assert_eq(kser.head(-10), pser.head(-10)) + self.assert_eq(psser.head(3), pser.head(3)) + self.assert_eq(psser.head(0), pser.head(0)) + self.assert_eq(psser.head(-3), pser.head(-3)) + self.assert_eq(psser.head(-10), pser.head(-10)) def test_last(self): with self.assertRaises(TypeError): - self.kser.last("1D") + self.psser.last("1D") index = pd.date_range("2018-04-09", periods=4, freq="2D") pser = pd.Series([1, 2, 3, 4], index=index) - kser = ps.from_pandas(pser) - self.assert_eq(kser.last("1D"), pser.last("1D")) + psser = ps.from_pandas(pser) + self.assert_eq(psser.last("1D"), pser.last("1D")) def test_first(self): with self.assertRaises(TypeError): - self.kser.first("1D") + self.psser.first("1D") index = pd.date_range("2018-04-09", periods=4, freq="2D") pser = pd.Series([1, 2, 3, 4], index=index) - kser = ps.from_pandas(pser) - self.assert_eq(kser.first("1D"), pser.first("1D")) + psser = ps.from_pandas(pser) + self.assert_eq(psser.first("1D"), pser.first("1D")) def test_rename(self): pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pser.name = "renamed" - kser.name = "renamed" - self.assertEqual(kser.name, "renamed") - self.assert_eq(kser, pser) + psser.name = "renamed" + self.assertEqual(psser.name, "renamed") + self.assert_eq(psser, pser) pser.name = None - kser.name = None - self.assertEqual(kser.name, None) - self.assert_eq(kser, pser) + psser.name = None + self.assertEqual(psser.name, None) + self.assert_eq(psser, pser) pidx = pser.index - kidx = kser.index + psidx = psser.index pidx.name = "renamed" - kidx.name = "renamed" - self.assertEqual(kidx.name, "renamed") - self.assert_eq(kidx, pidx) + psidx.name = "renamed" + self.assertEqual(psidx.name, "renamed") + self.assert_eq(psidx, pidx) expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - kser.name = ["renamed"] + psser.name = ["renamed"] with self.assertRaisesRegex(TypeError, expected_error_message): - kser.name = ["0", "1"] + psser.name = ["0", "1"] with self.assertRaisesRegex(TypeError, expected_error_message): ps.Series([1, 2, 3], name=["0", "1"]) def test_rename_method(self): # Series name pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.rename("y"), pser.rename("y")) - self.assertEqual(kser.name, "x") # no mutation - self.assert_eq(kser.rename(), pser.rename()) + self.assert_eq(psser.rename("y"), pser.rename("y")) + self.assertEqual(psser.name, "x") # no mutation + self.assert_eq(psser.rename(), pser.rename()) - self.assert_eq((kser.rename("y") + 1).head(), (pser.rename("y") + 1).head()) + self.assert_eq((psser.rename("y") + 1).head(), (pser.rename("y") + 1).head()) - kser.rename("z", inplace=True) + psser.rename("z", inplace=True) pser.rename("z", inplace=True) - self.assertEqual(kser.name, "z") - self.assert_eq(kser, pser) + self.assertEqual(psser.name, "z") + self.assert_eq(psser, pser) expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - kser.rename(["0", "1"]) + psser.rename(["0", "1"]) # Series index # pser = pd.Series(['a', 'b', 'c', 'd', 'e', 'f', 'g'], name='x') - # kser = ps.from_pandas(s) + # psser = ps.from_pandas(s) # TODO: index - # res = kser.rename(lambda x: x ** 2) + # res = psser.rename(lambda x: x ** 2) # 
self.assert_eq(res, pser.rename(lambda x: x ** 2)) - # res = kser.rename(pser) + # res = psser.rename(pser) # self.assert_eq(res, pser.rename(pser)) - # res = kser.rename(kser) + # res = psser.rename(psser) # self.assert_eq(res, pser.rename(pser)) - # res = kser.rename(lambda x: x**2, inplace=True) - # self.assertis(res, kser) + # res = psser.rename(lambda x: x**2, inplace=True) + # self.assertis(res, psser) # s.rename(lambda x: x**2, inplace=True) - # self.assert_eq(kser, pser) + # self.assert_eq(psser, pser) def test_rename_axis(self): index = pd.Index(["A", "B", "C"], name="index") pser = pd.Series([1.0, 2.0, 3.0], index=index, name="name") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.rename_axis("index2").sort_index(), kser.rename_axis("index2").sort_index(), + pser.rename_axis("index2").sort_index(), psser.rename_axis("index2").sort_index(), ) self.assert_eq( (pser + 1).rename_axis("index2").sort_index(), - (kser + 1).rename_axis("index2").sort_index(), + (psser + 1).rename_axis("index2").sort_index(), ) pser2 = pser.copy() - kser2 = kser.copy() + psser2 = psser.copy() pser2.rename_axis("index2", inplace=True) - kser2.rename_axis("index2", inplace=True) - self.assert_eq(pser2.sort_index(), kser2.sort_index()) + psser2.rename_axis("index2", inplace=True) + self.assert_eq(pser2.sort_index(), psser2.sort_index()) - self.assertRaises(ValueError, lambda: kser.rename_axis(["index2", "index3"])) - self.assertRaises(TypeError, lambda: kser.rename_axis(mapper=["index2"], index=["index3"])) + self.assertRaises(ValueError, lambda: psser.rename_axis(["index2", "index3"])) + self.assertRaises(TypeError, lambda: psser.rename_axis(mapper=["index2"], index=["index3"])) # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): self.assert_eq( pser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), - kser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), + psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), ) self.assert_eq( pser.rename_axis(index=str.upper).sort_index(), - kser.rename_axis(index=str.upper).sort_index(), + psser.rename_axis(index=str.upper).sort_index(), ) else: - expected = kser + expected = psser expected.index.name = "index2" - result = kser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index() + result = psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index() self.assert_eq(expected, result) - expected = kser + expected = psser expected.index.name = "INDEX" - result = kser.rename_axis(index=str.upper).sort_index() + result = psser.rename_axis(index=str.upper).sort_index() self.assert_eq(expected, result) index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] ) pser = pd.Series([1.0, 2.0, 3.0], index=index, name="name") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( pser.rename_axis(["index3", "index4"]).sort_index(), - kser.rename_axis(["index3", "index4"]).sort_index(), + psser.rename_axis(["index3", "index4"]).sort_index(), ) - self.assertRaises(ValueError, lambda: kser.rename_axis(["index3", "index4", "index5"])) + self.assertRaises(ValueError, lambda: psser.rename_axis(["index3", "index4", "index5"])) # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): @@ -324,25 +324,25 
@@ def test_rename_axis(self): pser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index(), - kser.rename_axis( + psser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index(), ) self.assert_eq( pser.rename_axis(index=str.upper).sort_index(), - kser.rename_axis(index=str.upper).sort_index(), + psser.rename_axis(index=str.upper).sort_index(), ) else: - expected = kser + expected = psser expected.index.names = ["index3", "index4"] - result = kser.rename_axis( + result = psser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index() self.assert_eq(expected, result) expected.index.names = ["INDEX1", "INDEX2"] - result = kser.rename_axis(index=str.upper).sort_index() + result = psser.rename_axis(index=str.upper).sort_index() self.assert_eq(expected, result) def test_or(self): @@ -352,15 +352,15 @@ def test_or(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq(kdf["left"] | kdf["right"], pdf["left"] | pdf["right"]) - self.assert_eq(kdf["left"] | True, pdf["left"] | True) - self.assert_eq(kdf["left"] | False, pdf["left"] | False) - self.assert_eq(kdf["left"] | None, pdf["left"] | None) - self.assert_eq(True | kdf["right"], True | pdf["right"]) - self.assert_eq(False | kdf["right"], False | pdf["right"]) - self.assert_eq(None | kdf["right"], None | pdf["right"]) + self.assert_eq(psdf["left"] | psdf["right"], pdf["left"] | pdf["right"]) + self.assert_eq(psdf["left"] | True, pdf["left"] | True) + self.assert_eq(psdf["left"] | False, pdf["left"] | False) + self.assert_eq(psdf["left"] | None, pdf["left"] | None) + self.assert_eq(True | psdf["right"], True | pdf["right"]) + self.assert_eq(False | psdf["right"], False | pdf["right"]) + self.assert_eq(None | psdf["right"], None | pdf["right"]) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -372,15 +372,15 @@ def test_or_extenstion_dtypes(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ).astype("boolean") - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self._check_extension(kdf["left"] | kdf["right"], pdf["left"] | pdf["right"]) - self._check_extension(kdf["left"] | True, pdf["left"] | True) - self._check_extension(kdf["left"] | False, pdf["left"] | False) - self._check_extension(kdf["left"] | pd.NA, pdf["left"] | pd.NA) - self._check_extension(True | kdf["right"], True | pdf["right"]) - self._check_extension(False | kdf["right"], False | pdf["right"]) - self._check_extension(pd.NA | kdf["right"], pd.NA | pdf["right"]) + self._check_extension(psdf["left"] | psdf["right"], pdf["left"] | pdf["right"]) + self._check_extension(psdf["left"] | True, pdf["left"] | True) + self._check_extension(psdf["left"] | False, pdf["left"] | False) + self._check_extension(psdf["left"] | pd.NA, pdf["left"] | pd.NA) + self._check_extension(True | psdf["right"], True | pdf["right"]) + self._check_extension(False | psdf["right"], False | pdf["right"]) + self._check_extension(pd.NA | psdf["right"], pd.NA | pdf["right"]) def test_and(self): pdf = pd.DataFrame( @@ -389,15 +389,15 @@ def test_and(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq(kdf["left"] & kdf["right"], pdf["left"] & pdf["right"]) - self.assert_eq(kdf["left"] & True, 
pdf["left"] & True) - self.assert_eq(kdf["left"] & False, pdf["left"] & False) - self.assert_eq(kdf["left"] & None, pdf["left"] & None) - self.assert_eq(True & kdf["right"], True & pdf["right"]) - self.assert_eq(False & kdf["right"], False & pdf["right"]) - self.assert_eq(None & kdf["right"], None & pdf["right"]) + self.assert_eq(psdf["left"] & psdf["right"], pdf["left"] & pdf["right"]) + self.assert_eq(psdf["left"] & True, pdf["left"] & True) + self.assert_eq(psdf["left"] & False, pdf["left"] & False) + self.assert_eq(psdf["left"] & None, pdf["left"] & None) + self.assert_eq(True & psdf["right"], True & pdf["right"]) + self.assert_eq(False & psdf["right"], False & pdf["right"]) + self.assert_eq(None & psdf["right"], None & pdf["right"]) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -409,96 +409,96 @@ def test_and_extenstion_dtypes(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ).astype("boolean") - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self._check_extension(kdf["left"] & kdf["right"], pdf["left"] & pdf["right"]) - self._check_extension(kdf["left"] & True, pdf["left"] & True) - self._check_extension(kdf["left"] & False, pdf["left"] & False) - self._check_extension(kdf["left"] & pd.NA, pdf["left"] & pd.NA) - self._check_extension(True & kdf["right"], True & pdf["right"]) - self._check_extension(False & kdf["right"], False & pdf["right"]) - self._check_extension(pd.NA & kdf["right"], pd.NA & pdf["right"]) + self._check_extension(psdf["left"] & psdf["right"], pdf["left"] & pdf["right"]) + self._check_extension(psdf["left"] & True, pdf["left"] & True) + self._check_extension(psdf["left"] & False, pdf["left"] & False) + self._check_extension(psdf["left"] & pd.NA, pdf["left"] & pd.NA) + self._check_extension(True & psdf["right"], True & pdf["right"]) + self._check_extension(False & psdf["right"], False & pdf["right"]) + self._check_extension(pd.NA & psdf["right"], pd.NA & pdf["right"]) def test_to_numpy(self): pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - kser = ps.from_pandas(pser) - self.assert_eq(kser.to_numpy(), pser.values) + psser = ps.from_pandas(pser) + self.assert_eq(psser.to_numpy(), pser.values) def test_isin(self): pser = pd.Series(["lama", "cow", "lama", "beetle", "lama", "hippo"], name="animal") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) - self.assert_eq(kser.isin(np.array(["cow", "lama"])), pser.isin(np.array(["cow", "lama"]))) - self.assert_eq(kser.isin({"cow"}), pser.isin({"cow"})) + self.assert_eq(psser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) + self.assert_eq(psser.isin(np.array(["cow", "lama"])), pser.isin(np.array(["cow", "lama"]))) + self.assert_eq(psser.isin({"cow"}), pser.isin({"cow"})) msg = "only list-like objects are allowed to be passed to isin()" with self.assertRaisesRegex(TypeError, msg): - kser.isin(1) + psser.isin(1) def test_drop_duplicates(self): pdf = pd.DataFrame({"animal": ["lama", "cow", "lama", "beetle", "lama", "hippo"]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.animal - kser = kdf.animal + psser = psdf.animal - self.assert_eq(kser.drop_duplicates().sort_index(), pser.drop_duplicates().sort_index()) + self.assert_eq(psser.drop_duplicates().sort_index(), pser.drop_duplicates().sort_index()) self.assert_eq( - kser.drop_duplicates(keep="last").sort_index(), + psser.drop_duplicates(keep="last").sort_index(), 
pser.drop_duplicates(keep="last").sort_index(), ) # inplace - kser.drop_duplicates(keep=False, inplace=True) + psser.drop_duplicates(keep=False, inplace=True) pser.drop_duplicates(keep=False, inplace=True) - self.assert_eq(kser.sort_index(), pser.sort_index()) - self.assert_eq(kdf, pdf) + self.assert_eq(psser.sort_index(), pser.sort_index()) + self.assert_eq(psdf, pdf) def test_reindex(self): index = ["A", "B", "C", "D", "E"] pser = pd.Series([1.0, 2.0, 3.0, 4.0, None], index=index, name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser, kser) + self.assert_eq(pser, psser) self.assert_eq( - pser.reindex(["A", "B"]).sort_index(), kser.reindex(["A", "B"]).sort_index(), + pser.reindex(["A", "B"]).sort_index(), psser.reindex(["A", "B"]).sort_index(), ) self.assert_eq( pser.reindex(["A", "B", "2", "3"]).sort_index(), - kser.reindex(["A", "B", "2", "3"]).sort_index(), + psser.reindex(["A", "B", "2", "3"]).sort_index(), ) self.assert_eq( pser.reindex(["A", "E", "2"], fill_value=0).sort_index(), - kser.reindex(["A", "E", "2"], fill_value=0).sort_index(), + psser.reindex(["A", "E", "2"], fill_value=0).sort_index(), ) - self.assertRaises(TypeError, lambda: kser.reindex(index=123)) + self.assertRaises(TypeError, lambda: psser.reindex(index=123)) def test_reindex_like(self): data = [1.0, 2.0, None] index = pd.Index(["A", "B", "C"], name="index1") pser = pd.Series(data=data, index=index, name="name1") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) # Reindexing single Index on single Index data2 = [3.0, None, 4.0] index2 = pd.Index(["A", "C", "D"], name="index2") pser2 = pd.Series(data=data2, index=index2, name="name2") - kser2 = ps.from_pandas(pser2) + psser2 = ps.from_pandas(pser2) self.assert_eq( - pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), + pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), ) self.assert_eq( (pser + 1).reindex_like(pser2).sort_index(), - (kser + 1).reindex_like(kser2).sort_index(), + (psser + 1).reindex_like(psser2).sort_index(), ) # Reindexing MultiIndex on single Index @@ -506,24 +506,24 @@ def test_reindex_like(self): [("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"] ) pser2 = pd.Series(data=data2, index=index2, name="name2") - kser2 = ps.from_pandas(pser2) + psser2 = ps.from_pandas(pser2) self.assert_eq( - pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), + pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), ) - self.assertRaises(TypeError, lambda: kser.reindex_like(index2)) - self.assertRaises(AssertionError, lambda: kser2.reindex_like(kser)) + self.assertRaises(TypeError, lambda: psser.reindex_like(index2)) + self.assertRaises(AssertionError, lambda: psser2.reindex_like(psser)) # Reindexing MultiIndex on MultiIndex index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] ) pser = pd.Series(data=data, index=index, name="name1") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), + pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), ) # Reindexing with DataFrame @@ -531,88 +531,88 @@ def test_reindex_like(self): [("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"] ) pdf = pd.DataFrame(data=data, index=index2) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) self.assert_eq( - 
pser.reindex_like(pdf).sort_index(), kser.reindex_like(kdf).sort_index(), + pser.reindex_like(pdf).sort_index(), psser.reindex_like(psdf).sort_index(), ) def test_fillna(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.x - kser = kdf.x + psser = psdf.x - self.assert_eq(kser.fillna(0), pser.fillna(0)) - self.assert_eq(kser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) + self.assert_eq(psser.fillna(0), pser.fillna(0)) + self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) - kser.fillna(0, inplace=True) + psser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(kser, pser) - self.assert_eq(kdf, pdf) + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) # test considering series does not have NA/NaN values - kser.fillna(0, inplace=True) + psser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(kser, pser) + self.assert_eq(psser, pser) - kser = kdf.x.rename("y") + psser = psdf.x.rename("y") pser = pdf.x.rename("y") - kser.fillna(0, inplace=True) + psser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(kser.head(), pser.head()) + self.assert_eq(psser.head(), pser.head()) pser = pd.Series([1, 2, 3, 4, 5, 6], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pser.loc[3] = np.nan - kser.loc[3] = np.nan + psser.loc[3] = np.nan - self.assert_eq(kser.fillna(0), pser.fillna(0)) - self.assert_eq(kser.fillna(method="ffill"), pser.fillna(method="ffill")) - self.assert_eq(kser.fillna(method="bfill"), pser.fillna(method="bfill")) + self.assert_eq(psser.fillna(0), pser.fillna(0)) + self.assert_eq(psser.fillna(method="ffill"), pser.fillna(method="ffill")) + self.assert_eq(psser.fillna(method="bfill"), pser.fillna(method="bfill")) # inplace fillna on non-nullable column pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.b - kser = kdf.b + psser = psdf.b - self.assert_eq(kser.fillna(0), pser.fillna(0)) - self.assert_eq(kser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) + self.assert_eq(psser.fillna(0), pser.fillna(0)) + self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) - kser.fillna(0, inplace=True) + psser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(kser, pser) - self.assert_eq(kdf, pdf) + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) def test_dropna(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.x - kser = kdf.x + psser = psdf.x - self.assert_eq(kser.dropna(), pser.dropna()) + self.assert_eq(psser.dropna(), pser.dropna()) pser.dropna(inplace=True) - kser.dropna(inplace=True) - self.assert_eq(kser, pser) - self.assert_eq(kdf, pdf) + psser.dropna(inplace=True) + self.assert_eq(psser, pser) + self.assert_eq(psdf, pdf) def test_nunique(self): pser = pd.Series([1, 2, 1, np.nan]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) # Assert NaNs are dropped by default - nunique_result = kser.nunique() + nunique_result = psser.nunique() self.assertEqual(nunique_result, 2) self.assert_eq(nunique_result, pser.nunique()) # Assert including NaN values - nunique_result = kser.nunique(dropna=False) + nunique_result = psser.nunique(dropna=False) self.assertEqual(nunique_result, 3) self.assert_eq(nunique_result, pser.nunique(dropna=False)) @@ -627,105 
+627,105 @@ def test_value_counts(self): index=[1, 2, 1, 3, 3, np.nan, 1, 4, 2, np.nan, 3, np.nan, 3, 1, 3], name="x", ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) exp = pser.value_counts() - res = kser.value_counts() + res = psser.value_counts() self.assertEqual(res.name, exp.name) self.assert_eq(res, exp) - self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - kser.value_counts(normalize=True, dropna=False), + psser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.value_counts(ascending=True, dropna=False), + psser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) self.assert_eq( - kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) with self.assertRaisesRegex( NotImplementedError, "value_counts currently does not support bins" ): - kser.value_counts(bins=3) + psser.value_counts(bins=3) pser.name = "index" - kser.name = "index" - self.assert_eq(kser.value_counts(), pser.value_counts()) + psser.name = "index" + self.assert_eq(psser.value_counts(), pser.value_counts()) # Series from DataFrame pdf = pd.DataFrame({"a": [2, 2, 3], "b": [None, 1, None]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq(kdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True)) - self.assert_eq(kdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True)) + self.assert_eq(psdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True)) + self.assert_eq(psdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True)) self.assert_eq( - kdf.a.value_counts(normalize=True, dropna=False), + psdf.a.value_counts(normalize=True, dropna=False), pdf.a.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kdf.a.value_counts(ascending=True, dropna=False), + psdf.a.value_counts(ascending=True, dropna=False), pdf.a.value_counts(ascending=True, dropna=False), ) self.assert_eq( - kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - 
kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) # Series with NaN index pser = pd.Series([3, 2, 3, 1, 2, 3], index=[2.0, None, 5.0, 5.0, None, 5.0]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - kser.value_counts(normalize=True, dropna=False), + psser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.value_counts(ascending=True, dropna=False), + psser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) self.assert_eq( - kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) @@ -733,37 +733,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - kser.value_counts(normalize=True, dropna=False), + psser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.value_counts(ascending=True, dropna=False), + psser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. 
self.assert_eq( - kser.index.value_counts(normalize=True), + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True), + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -772,37 +772,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", None), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - kser.value_counts(normalize=True, dropna=False), + psser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.value_counts(ascending=True, dropna=False), + psser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. self.assert_eq( - kser.index.value_counts(normalize=True), + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True), + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -813,37 +813,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), None, ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - kser.value_counts(normalize=True, dropna=False), + psser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - kser.value_counts(ascending=True, dropna=False), + psser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. 
self.assert_eq( - kser.index.value_counts(normalize=True), + psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True), + psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - kser.index.value_counts(normalize=True, dropna=False), + psser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - kser.index.value_counts(ascending=True, dropna=False), + psser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -851,31 +851,31 @@ def test_value_counts(self): def test_nsmallest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] pser = pd.Series(sample_lst, name="x") - kser = ps.Series(sample_lst, name="x") - self.assert_eq(kser.nsmallest(n=3), pser.nsmallest(n=3)) - self.assert_eq(kser.nsmallest(), pser.nsmallest()) - self.assert_eq((kser + 1).nsmallest(), (pser + 1).nsmallest()) + psser = ps.Series(sample_lst, name="x") + self.assert_eq(psser.nsmallest(n=3), pser.nsmallest(n=3)) + self.assert_eq(psser.nsmallest(), pser.nsmallest()) + self.assert_eq((psser + 1).nsmallest(), (pser + 1).nsmallest()) def test_nlargest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] pser = pd.Series(sample_lst, name="x") - kser = ps.Series(sample_lst, name="x") - self.assert_eq(kser.nlargest(n=3), pser.nlargest(n=3)) - self.assert_eq(kser.nlargest(), pser.nlargest()) - self.assert_eq((kser + 1).nlargest(), (pser + 1).nlargest()) + psser = ps.Series(sample_lst, name="x") + self.assert_eq(psser.nlargest(n=3), pser.nlargest(n=3)) + self.assert_eq(psser.nlargest(), pser.nlargest()) + self.assert_eq((psser + 1).nlargest(), (pser + 1).nlargest()) def test_isnull(self): pser = pd.Series([1, 2, 3, 4, np.nan, 6], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.notnull(), pser.notnull()) - self.assert_eq(kser.isnull(), pser.isnull()) + self.assert_eq(psser.notnull(), pser.notnull()) + self.assert_eq(psser.isnull(), pser.isnull()) pser = self.pser - kser = self.kser + psser = self.psser - self.assert_eq(kser.notnull(), pser.notnull()) - self.assert_eq(kser.isnull(), pser.isnull()) + self.assert_eq(psser.notnull(), pser.notnull()) + self.assert_eq(psser.isnull(), pser.isnull()) def test_all(self): for pser in [ @@ -888,18 +888,18 @@ def test_all(self): pd.Series([], name="x"), pd.Series([np.nan], name="x"), ]: - kser = ps.from_pandas(pser) - self.assert_eq(kser.all(), pser.all()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.all(), pser.all()) pser = pd.Series([1, 2, 3, 4], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq((kser % 2 == 0).all(), (pser % 2 == 0).all()) + self.assert_eq((psser % 2 == 0).all(), (pser % 2 == 0).all()) with self.assertRaisesRegex( NotImplementedError, 'axis should be either 0 or "index" currently.' 
         ):
-            kser.all(axis=1)
+            psser.all(axis=1)

     def test_any(self):
         for pser in [
             pd.Series([False, False], name="x"),
             pd.Series([True, False], name="x"),
             pd.Series([0, 1], name="x"),
             pd.Series([1, 2, 3], name="x"),
             pd.Series([True, True, None], name="x"),
             pd.Series([True, False, None], name="x"),
             pd.Series([], name="x"),
             pd.Series([np.nan], name="x"),
         ]:
-            kser = ps.from_pandas(pser)
-            self.assert_eq(kser.any(), pser.any())
+            psser = ps.from_pandas(pser)
+            self.assert_eq(psser.any(), pser.any())

         pser = pd.Series([1, 2, 3, 4], name="x")
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq((kser % 2 == 0).any(), (pser % 2 == 0).any())
+        self.assert_eq((psser % 2 == 0).any(), (pser % 2 == 0).any())

         with self.assertRaisesRegex(
             NotImplementedError, 'axis should be either 0 or "index" currently.'
         ):
-            kser.any(axis=1)
+            psser.any(axis=1)

     def test_reset_index(self):
         pdf = pd.DataFrame({"foo": [1, 2, 3, 4]}, index=pd.Index(["a", "b", "c", "d"], name="idx"))
-        kdf = ps.from_pandas(pdf)
+        psdf = ps.from_pandas(pdf)

         pser = pdf.foo
-        kser = kdf.foo
+        psser = psdf.foo

-        self.assert_eq(kser.reset_index(), pser.reset_index())
-        self.assert_eq(kser.reset_index(name="values"), pser.reset_index(name="values"))
-        self.assert_eq(kser.reset_index(drop=True), pser.reset_index(drop=True))
+        self.assert_eq(psser.reset_index(), pser.reset_index())
+        self.assert_eq(psser.reset_index(name="values"), pser.reset_index(name="values"))
+        self.assert_eq(psser.reset_index(drop=True), pser.reset_index(drop=True))

         # inplace
-        kser.reset_index(drop=True, inplace=True)
+        psser.reset_index(drop=True, inplace=True)
         pser.reset_index(drop=True, inplace=True)
-        self.assert_eq(kser, pser)
-        self.assert_eq(kdf, pdf)
+        self.assert_eq(psser, pser)
+        self.assert_eq(psdf, pdf)

     def test_reset_index_with_default_index_types(self):
         pser = pd.Series([1, 2, 3], name="0", index=np.random.rand(3))
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

         with ps.option_context("compute.default_index_type", "sequence"):
-            self.assert_eq(kser.reset_index(), pser.reset_index())
+            self.assert_eq(psser.reset_index(), pser.reset_index())

         with ps.option_context("compute.default_index_type", "distributed-sequence"):
             # the order might be changed.
-            self.assert_eq(kser.reset_index().sort_index(), pser.reset_index())
+            self.assert_eq(psser.reset_index().sort_index(), pser.reset_index())

         with ps.option_context("compute.default_index_type", "distributed"):
             # the index is different.
             self.assert_eq(
-                kser.reset_index().to_pandas().reset_index(drop=True), pser.reset_index()
+                psser.reset_index().to_pandas().reset_index(drop=True), pser.reset_index()
             )

     def test_index_to_series_reset_index(self):
-        def check(kser, pser):
-            self.assert_eq(kser.reset_index(), pser.reset_index())
-            self.assert_eq(kser.reset_index(drop=True), pser.reset_index(drop=True))
+        def check(psser, pser):
+            self.assert_eq(psser.reset_index(), pser.reset_index())
+            self.assert_eq(psser.reset_index(drop=True), pser.reset_index(drop=True))

             pser.reset_index(drop=True, inplace=True)
-            kser.reset_index(drop=True, inplace=True)
-            self.assert_eq(kser, pser)
+            psser.reset_index(drop=True, inplace=True)
+            self.assert_eq(psser, pser)

         pdf = pd.DataFrame(
             {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
             index=np.random.rand(9),
         )
-        kdf = ps.from_pandas(pdf)
-        check(kdf.index.to_series(), pdf.index.to_series())
-        check(kdf.index.to_series(name="a"), pdf.index.to_series(name="a"))
-        check(kdf.index.to_series(name=("x", "a")), pdf.index.to_series(name=("x", "a")))
+        psdf = ps.from_pandas(pdf)
+        check(psdf.index.to_series(), pdf.index.to_series())
+        check(psdf.index.to_series(name="a"), pdf.index.to_series(name="a"))
+        check(psdf.index.to_series(name=("x", "a")), pdf.index.to_series(name=("x", "a")))

     def test_sort_values(self):
         pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]})
-        kdf = ps.from_pandas(pdf)
+        psdf = ps.from_pandas(pdf)

         pser = pdf.x
-        kser = kdf.x
+        psser = psdf.x

-        self.assert_eq(kser.sort_values(), pser.sort_values())
-        self.assert_eq(kser.sort_values(ascending=False), pser.sort_values(ascending=False))
-        self.assert_eq(kser.sort_values(na_position="first"), pser.sort_values(na_position="first"))
+        self.assert_eq(psser.sort_values(), pser.sort_values())
+        self.assert_eq(psser.sort_values(ascending=False), pser.sort_values(ascending=False))
+        self.assert_eq(
+            psser.sort_values(na_position="first"), pser.sort_values(na_position="first")
+        )

-        self.assertRaises(ValueError, lambda: kser.sort_values(na_position="invalid"))
+        self.assertRaises(ValueError, lambda: psser.sort_values(na_position="invalid"))

         # inplace
         # pandas raises an exception when the Series is derived from DataFrame
-        kser.sort_values(inplace=True)
-        self.assert_eq(kser, pser.sort_values())
-        self.assert_eq(kdf, pdf)
+        psser.sort_values(inplace=True)
+        self.assert_eq(psser, pser.sort_values())
+        self.assert_eq(psdf, pdf)

         pser = pdf.x.copy()
-        kser = kdf.x.copy()
+        psser = psdf.x.copy()

-        kser.sort_values(inplace=True)
+        psser.sort_values(inplace=True)
         pser.sort_values(inplace=True)
-        self.assert_eq(kser, pser)
-        self.assert_eq(kdf, pdf)
+        self.assert_eq(psser, pser)
+        self.assert_eq(psdf, pdf)

     def test_sort_index(self):
         pdf = pd.DataFrame({"x": [2, 1, np.nan]}, index=["b", "a", np.nan])
-        kdf = ps.from_pandas(pdf)
+        psdf = ps.from_pandas(pdf)

         pser = pdf.x
-        kser = kdf.x
+        psser = psdf.x

         # Assert invalid parameters
-        self.assertRaises(NotImplementedError, lambda: kser.sort_index(axis=1))
-        self.assertRaises(NotImplementedError, lambda: kser.sort_index(kind="mergesort"))
-        self.assertRaises(ValueError, lambda: kser.sort_index(na_position="invalid"))
+        self.assertRaises(NotImplementedError, lambda: psser.sort_index(axis=1))
+        self.assertRaises(NotImplementedError, lambda: psser.sort_index(kind="mergesort"))
+        self.assertRaises(ValueError, lambda: psser.sort_index(na_position="invalid"))

         # Assert default behavior without parameters
-        self.assert_eq(kser.sort_index(), pser.sort_index())
+        self.assert_eq(psser.sort_index(), pser.sort_index())
         # Assert sorting descending
-        self.assert_eq(kser.sort_index(ascending=False), pser.sort_index(ascending=False))
+        self.assert_eq(psser.sort_index(ascending=False), pser.sort_index(ascending=False))

         # Assert sorting NA indices first
-        self.assert_eq(kser.sort_index(na_position="first"), pser.sort_index(na_position="first"))
+        self.assert_eq(psser.sort_index(na_position="first"), pser.sort_index(na_position="first"))

         # Assert sorting inplace
         # pandas sorts pdf.x by the index and update the column only
         # when the Series is derived from DataFrame.
-        kser.sort_index(inplace=True)
-        self.assert_eq(kser, pser.sort_index())
-        self.assert_eq(kdf, pdf)
+        psser.sort_index(inplace=True)
+        self.assert_eq(psser, pser.sort_index())
+        self.assert_eq(psdf, pdf)

         pser = pdf.x.copy()
-        kser = kdf.x.copy()
+        psser = psdf.x.copy()

-        kser.sort_index(inplace=True)
+        psser.sort_index(inplace=True)
         pser.sort_index(inplace=True)
-        self.assert_eq(kser, pser)
-        self.assert_eq(kdf, pdf)
+        self.assert_eq(psser, pser)
+        self.assert_eq(psdf, pdf)

         # Assert multi-indices
         pser = pd.Series(range(4), index=[["b", "b", "a", "a"], [1, 0, 1, 0]], name="0")
-        kser = ps.from_pandas(pser)
-        self.assert_eq(kser.sort_index(), pser.sort_index())
-        self.assert_eq(kser.sort_index(level=[1, 0]), pser.sort_index(level=[1, 0]))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(psser.sort_index(), pser.sort_index())
+        self.assert_eq(psser.sort_index(level=[1, 0]), pser.sort_index(level=[1, 0]))

-        self.assert_eq(kser.reset_index().sort_index(), pser.reset_index().sort_index())
+        self.assert_eq(psser.reset_index().sort_index(), pser.reset_index().sort_index())

     def test_to_datetime(self):
         pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

         self.assert_eq(
             pd.to_datetime(pser, infer_datetime_format=True),
-            ps.to_datetime(kser, infer_datetime_format=True),
+            ps.to_datetime(psser, infer_datetime_format=True),
         )

     def test_missing(self):
-        kser = self.kser
+        psser = self.psser

         missing_functions = inspect.getmembers(MissingPandasLikeSeries, inspect.isfunction)
         unsupported_functions = [
@@ -1067,7 +1069,7 @@ def test_missing(self):
                 PandasNotImplementedError,
                 "method.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name),
             ):
-                getattr(kser, name)()
+                getattr(psser, name)()

         deprecated_functions = [
             name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function"
@@ -1076,7 +1078,7 @@ def test_missing(self):
             with self.assertRaisesRegex(
                 PandasNotImplementedError, "method.*Series.*{}.*is deprecated".format(name)
             ):
-                getattr(kser, name)()
+                getattr(psser, name)()

         missing_properties = inspect.getmembers(
             MissingPandasLikeSeries, lambda o: isinstance(o, property)
@@ -1091,7 +1093,7 @@ def test_missing(self):
                 PandasNotImplementedError,
                 "property.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name),
             ):
-                getattr(kser, name)
+                getattr(psser, name)

         deprecated_properties = [
             name
             for (name, type_) in missing_properties
@@ -1101,267 +1103,271 @@ def test_missing(self):
             with self.assertRaisesRegex(
                 PandasNotImplementedError, "property.*Series.*{}.*is deprecated".format(name)
             ):
-                getattr(kser, name)
+                getattr(psser, name)

     def test_clip(self):
         pser = pd.Series([0, 2, 4], index=np.random.rand(3))
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

         # Assert list-like values are not accepted for 'lower' and 'upper'
         msg = "List-like value are not supported for 'lower' and 'upper' at the moment"
-        with self.assertRaises(ValueError, msg=msg):
-            kser.clip(lower=[1])
-        with self.assertRaises(ValueError, msg=msg):
-            kser.clip(upper=[1])
+        with self.assertRaises(TypeError, msg=msg):
+            psser.clip(lower=[1])
+        with self.assertRaises(TypeError, msg=msg):
+            psser.clip(upper=[1])

         # Assert no lower or upper
-        self.assert_eq(kser.clip(), pser.clip())
+        self.assert_eq(psser.clip(), pser.clip())
         # Assert lower only
-        self.assert_eq(kser.clip(1), pser.clip(1))
+        self.assert_eq(psser.clip(1), pser.clip(1))
         # Assert upper only
-        self.assert_eq(kser.clip(upper=3), pser.clip(upper=3))
+        self.assert_eq(psser.clip(upper=3), pser.clip(upper=3))
         # Assert lower and upper
-        self.assert_eq(kser.clip(1, 3), pser.clip(1, 3))
+        self.assert_eq(psser.clip(1, 3), pser.clip(1, 3))

         # Assert behavior on string values
-        str_kser = ps.Series(["a", "b", "c"])
-        self.assert_eq(str_kser.clip(1, 3), str_kser)
+        str_psser = ps.Series(["a", "b", "c"])
+        self.assert_eq(str_psser.clip(1, 3), str_psser)

     def test_compare(self):
         if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
             pser = pd.Series([1, 2])
-            kser = ps.from_pandas(pser)
+            psser = ps.from_pandas(pser)

-            res_kdf = kser.compare(kser)
-            self.assertTrue(res_kdf.empty)
-            self.assert_eq(res_kdf.columns, pd.Index(["self", "other"]))
+            res_psdf = psser.compare(psser)
+            self.assertTrue(res_psdf.empty)
+            self.assert_eq(res_psdf.columns, pd.Index(["self", "other"]))

-            self.assert_eq(pser.compare(pser + 1).sort_index(), kser.compare(kser + 1).sort_index())
+            self.assert_eq(
+                pser.compare(pser + 1).sort_index(), psser.compare(psser + 1).sort_index()
+            )

             pser = pd.Series([1, 2], index=["x", "y"])
-            kser = ps.from_pandas(pser)
-            self.assert_eq(pser.compare(pser + 1).sort_index(), kser.compare(kser + 1).sort_index())
+            psser = ps.from_pandas(pser)
+            self.assert_eq(
+                pser.compare(pser + 1).sort_index(), psser.compare(psser + 1).sort_index()
+            )
         else:
-            kser = ps.Series([1, 2])
-            res_kdf = kser.compare(kser)
-            self.assertTrue(res_kdf.empty)
-            self.assert_eq(res_kdf.columns, pd.Index(["self", "other"]))
+            psser = ps.Series([1, 2])
+            res_psdf = psser.compare(psser)
+            self.assertTrue(res_psdf.empty)
+            self.assert_eq(res_psdf.columns, pd.Index(["self", "other"]))
             expected = ps.DataFrame([[1, 2], [2, 3]], columns=["self", "other"])
-            self.assert_eq(expected, kser.compare(kser + 1).sort_index())
+            self.assert_eq(expected, psser.compare(psser + 1).sort_index())

-            kser = ps.Series([1, 2], index=["x", "y"])
+            psser = ps.Series([1, 2], index=["x", "y"])
             expected = ps.DataFrame([[1, 2], [2, 3]], index=["x", "y"], columns=["self", "other"])
-            self.assert_eq(expected, kser.compare(kser + 1).sort_index())
+            self.assert_eq(expected, psser.compare(psser + 1).sort_index())

     def test_is_unique(self):
         # We can't use pandas' is_unique for comparison. pandas 0.23 ignores None
         pser = pd.Series([1, 2, 2, None, None])
-        kser = ps.from_pandas(pser)
-        self.assertEqual(False, kser.is_unique)
-        self.assertEqual(False, (kser + 1).is_unique)
+        psser = ps.from_pandas(pser)
+        self.assertEqual(False, psser.is_unique)
+        self.assertEqual(False, (psser + 1).is_unique)

         pser = pd.Series([1, None, None])
-        kser = ps.from_pandas(pser)
-        self.assertEqual(False, kser.is_unique)
-        self.assertEqual(False, (kser + 1).is_unique)
+        psser = ps.from_pandas(pser)
+        self.assertEqual(False, psser.is_unique)
+        self.assertEqual(False, (psser + 1).is_unique)

         pser = pd.Series([1])
-        kser = ps.from_pandas(pser)
-        self.assertEqual(pser.is_unique, kser.is_unique)
-        self.assertEqual((pser + 1).is_unique, (kser + 1).is_unique)
+        psser = ps.from_pandas(pser)
+        self.assertEqual(pser.is_unique, psser.is_unique)
+        self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique)

         pser = pd.Series([1, 1, 1])
-        kser = ps.from_pandas(pser)
-        self.assertEqual(pser.is_unique, kser.is_unique)
-        self.assertEqual((pser + 1).is_unique, (kser + 1).is_unique)
+        psser = ps.from_pandas(pser)
+        self.assertEqual(pser.is_unique, psser.is_unique)
+        self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique)

     def test_to_list(self):
-        self.assert_eq(self.kser.tolist(), self.pser.tolist())
+        self.assert_eq(self.psser.tolist(), self.pser.tolist())

     def test_append(self):
         pser1 = pd.Series([1, 2, 3], name="0")
         pser2 = pd.Series([4, 5, 6], name="0")
         pser3 = pd.Series([4, 5, 6], index=[3, 4, 5], name="0")
-        kser1 = ps.from_pandas(pser1)
-        kser2 = ps.from_pandas(pser2)
-        kser3 = ps.from_pandas(pser3)
+        psser1 = ps.from_pandas(pser1)
+        psser2 = ps.from_pandas(pser2)
+        psser3 = ps.from_pandas(pser3)

-        self.assert_eq(kser1.append(kser2), pser1.append(pser2))
-        self.assert_eq(kser1.append(kser3), pser1.append(pser3))
+        self.assert_eq(psser1.append(psser2), pser1.append(pser2))
+        self.assert_eq(psser1.append(psser3), pser1.append(pser3))
         self.assert_eq(
-            kser1.append(kser2, ignore_index=True), pser1.append(pser2, ignore_index=True)
+            psser1.append(psser2, ignore_index=True), pser1.append(pser2, ignore_index=True)
         )

-        kser1.append(kser3, verify_integrity=True)
+        psser1.append(psser3, verify_integrity=True)
         msg = "Indices have overlapping values"
         with self.assertRaises(ValueError, msg=msg):
-            kser1.append(kser2, verify_integrity=True)
+            psser1.append(psser2, verify_integrity=True)

     def test_map(self):
         pser = pd.Series(["cat", "dog", None, "rabbit"])
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)
         # Currently Koalas doesn't return NaN as pandas does.
-        self.assert_eq(kser.map({}), pser.map({}).replace({pd.np.nan: None}))
+        self.assert_eq(psser.map({}), pser.map({}).replace({pd.np.nan: None}))

         d = defaultdict(lambda: "abc")
-        self.assertTrue("abc" in repr(kser.map(d)))
-        self.assert_eq(kser.map(d), pser.map(d))
+        self.assertTrue("abc" in repr(psser.map(d)))
+        self.assert_eq(psser.map(d), pser.map(d))

         def tomorrow(date) -> datetime:
             return date + timedelta(days=1)

         pser = pd.Series([datetime(2019, 10, 24)])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(kser.map(tomorrow), pser.map(tomorrow))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(psser.map(tomorrow), pser.map(tomorrow))

     def test_add_prefix(self):
         pser = pd.Series([1, 2, 3, 4], name="0")
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_"))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.add_prefix("item_"), psser.add_prefix("item_"))

         pser = pd.Series(
             [1, 2, 3],
             name="0",
             index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]),
         )
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_"))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.add_prefix("item_"), psser.add_prefix("item_"))

     def test_add_suffix(self):
         pser = pd.Series([1, 2, 3, 4], name="0")
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item"))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.add_suffix("_item"), psser.add_suffix("_item"))

         pser = pd.Series(
             [1, 2, 3],
             name="0",
             index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]),
         )
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item"))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.add_suffix("_item"), psser.add_suffix("_item"))

     def test_cummin(self):
         pser = pd.Series([1.0, None, 0.0, 4.0, 9.0])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cummin(), kser.cummin())
-        self.assert_eq(pser.cummin(skipna=False), kser.cummin(skipna=False))
-        self.assert_eq(pser.cummin().sum(), kser.cummin().sum())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cummin(), psser.cummin())
+        self.assert_eq(pser.cummin(skipna=False), psser.cummin(skipna=False))
+        self.assert_eq(pser.cummin().sum(), psser.cummin().sum())

         # with reversed index
         pser.index = [4, 3, 2, 1, 0]
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cummin(), kser.cummin())
-        self.assert_eq(pser.cummin(skipna=False), kser.cummin(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cummin(), psser.cummin())
+        self.assert_eq(pser.cummin(skipna=False), psser.cummin(skipna=False))

     def test_cummax(self):
         pser = pd.Series([1.0, None, 0.0, 4.0, 9.0])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cummax(), kser.cummax())
-        self.assert_eq(pser.cummax(skipna=False), kser.cummax(skipna=False))
-        self.assert_eq(pser.cummax().sum(), kser.cummax().sum())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cummax(), psser.cummax())
+        self.assert_eq(pser.cummax(skipna=False), psser.cummax(skipna=False))
+        self.assert_eq(pser.cummax().sum(), psser.cummax().sum())

         # with reversed index
         pser.index = [4, 3, 2, 1, 0]
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cummax(), kser.cummax())
-        self.assert_eq(pser.cummax(skipna=False), kser.cummax(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cummax(), psser.cummax())
+        self.assert_eq(pser.cummax(skipna=False), psser.cummax(skipna=False))

     def test_cumsum(self):
         pser = pd.Series([1.0, None, 0.0, 4.0, 9.0])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumsum(), kser.cumsum())
-        self.assert_eq(pser.cumsum(skipna=False), kser.cumsum(skipna=False))
-        self.assert_eq(pser.cumsum().sum(), kser.cumsum().sum())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumsum(), psser.cumsum())
+        self.assert_eq(pser.cumsum(skipna=False), psser.cumsum(skipna=False))
+        self.assert_eq(pser.cumsum().sum(), psser.cumsum().sum())

         # with reversed index
         pser.index = [4, 3, 2, 1, 0]
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumsum(), kser.cumsum())
-        self.assert_eq(pser.cumsum(skipna=False), kser.cumsum(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumsum(), psser.cumsum())
+        self.assert_eq(pser.cumsum(skipna=False), psser.cumsum(skipna=False))

         # bool
         pser = pd.Series([True, True, False, True])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumsum().astype(int), kser.cumsum())
-        self.assert_eq(pser.cumsum(skipna=False).astype(int), kser.cumsum(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumsum().astype(int), psser.cumsum())
+        self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False))

     def test_cumprod(self):
         pser = pd.Series([1.0, None, 1.0, 4.0, 9.0])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
-        self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False))
+        self.assert_eq(pser.cumprod().sum(), psser.cumprod().sum())

         # with integer type
         pser = pd.Series([1, 10, 1, 4, 9])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
-        self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False))
+        self.assert_eq(pser.cumprod().sum(), psser.cumprod().sum())

         # with reversed index
         pser.index = [4, 3, 2, 1, 0]
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False))

         # including zero
         pser = pd.Series([1, 2, 0, 3])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False))

         # including negative values
         pser = pd.Series([1, -1, -2])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False))

         # bool
         pser = pd.Series([True, True, False, True])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.cumprod(), kser.cumprod())
-        self.assert_eq(pser.cumprod(skipna=False).astype(int), kser.cumprod(skipna=False))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), psser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False))
     def test_median(self):
-        with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
+        with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
             ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")

     def test_rank(self):
         pser = pd.Series([1, 2, 3, 1], name="x")
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.rank(), kser.rank().sort_index())
-        self.assert_eq(pser.rank().sum(), kser.rank().sum())
-        self.assert_eq(pser.rank(ascending=False), kser.rank(ascending=False).sort_index())
-        self.assert_eq(pser.rank(method="min"), kser.rank(method="min").sort_index())
-        self.assert_eq(pser.rank(method="max"), kser.rank(method="max").sort_index())
-        self.assert_eq(pser.rank(method="first"), kser.rank(method="first").sort_index())
-        self.assert_eq(pser.rank(method="dense"), kser.rank(method="dense").sort_index())
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.rank(), psser.rank().sort_index())
+        self.assert_eq(pser.rank().sum(), psser.rank().sum())
+        self.assert_eq(pser.rank(ascending=False), psser.rank(ascending=False).sort_index())
+        self.assert_eq(pser.rank(method="min"), psser.rank(method="min").sort_index())
+        self.assert_eq(pser.rank(method="max"), psser.rank(method="max").sort_index())
+        self.assert_eq(pser.rank(method="first"), psser.rank(method="first").sort_index())
+        self.assert_eq(pser.rank(method="dense"), psser.rank(method="dense").sort_index())

         msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.rank(method="nothing")
+            psser.rank(method="nothing")

     def test_round(self):
         pser = pd.Series([0.028208, 0.038683, 0.877076], name="x")
-        kser = ps.from_pandas(pser)
-        self.assert_eq(pser.round(2), kser.round(2))
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.round(2), psser.round(2))
         msg = "decimals must be an integer"
-        with self.assertRaisesRegex(ValueError, msg):
-            kser.round(1.5)
+        with self.assertRaisesRegex(TypeError, msg):
+            psser.round(1.5)

     def test_quantile(self):
         pser = pd.Series([])
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.quantile(0.5), pser.quantile(0.5))
-        self.assert_eq(kser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75]))
+        self.assert_eq(psser.quantile(0.5), pser.quantile(0.5))
+        self.assert_eq(psser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75]))

-        with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
+        with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
             ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a")
-        with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
+        with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
             ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a")
-        with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
+        with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
             ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

         with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
@@ -1371,149 +1377,151 @@ def test_quantile(self):

     def test_idxmax(self):
         pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"])
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmax(), pser.idxmax())
-        self.assertEqual(kser.idxmax(skipna=False), pser.idxmax(skipna=False))
+        self.assertEqual(psser.idxmax(), pser.idxmax())
+        self.assertEqual(psser.idxmax(skipna=False), pser.idxmax(skipna=False))

         index = pd.MultiIndex.from_arrays(
             [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second")
         )
         pser = pd.Series(data=[1, 2, 4, 5], index=index)
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmax(), pser.idxmax())
-        self.assertEqual(kser.idxmax(skipna=False), pser.idxmax(skipna=False))
+        self.assertEqual(psser.idxmax(), pser.idxmax())
+        self.assertEqual(psser.idxmax(skipna=False), pser.idxmax(skipna=False))

-        kser = ps.Series([])
+        psser = ps.Series([])
         with self.assertRaisesRegex(ValueError, "an empty sequence"):
-            kser.idxmax()
+            psser.idxmax()

         pser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmax(), pser.idxmax())
-        self.assertEqual(repr(kser.idxmax(skipna=False)), repr(pser.idxmax(skipna=False)))
+        self.assertEqual(psser.idxmax(), pser.idxmax())
+        self.assertEqual(repr(psser.idxmax(skipna=False)), repr(pser.idxmax(skipna=False)))

     def test_idxmin(self):
         pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"])
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmin(), pser.idxmin())
-        self.assertEqual(kser.idxmin(skipna=False), pser.idxmin(skipna=False))
+        self.assertEqual(psser.idxmin(), pser.idxmin())
+        self.assertEqual(psser.idxmin(skipna=False), pser.idxmin(skipna=False))

         index = pd.MultiIndex.from_arrays(
             [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second")
         )
         pser = pd.Series(data=[1, 2, 4, 5], index=index)
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmin(), pser.idxmin())
-        self.assertEqual(kser.idxmin(skipna=False), pser.idxmin(skipna=False))
+        self.assertEqual(psser.idxmin(), pser.idxmin())
+        self.assertEqual(psser.idxmin(skipna=False), pser.idxmin(skipna=False))

-        kser = ps.Series([])
+        psser = ps.Series([])
         with self.assertRaisesRegex(ValueError, "an empty sequence"):
-            kser.idxmin()
+            psser.idxmin()

         pser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assertEqual(kser.idxmin(), pser.idxmin())
-        self.assertEqual(repr(kser.idxmin(skipna=False)), repr(pser.idxmin(skipna=False)))
+        self.assertEqual(psser.idxmin(), pser.idxmin())
+        self.assertEqual(repr(psser.idxmin(skipna=False)), repr(pser.idxmin(skipna=False)))

     def test_shift(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.shift(2), pser.shift(2))
-        self.assert_eq(kser.shift().shift(-1), pser.shift().shift(-1))
-        self.assert_eq(kser.shift().sum(), pser.shift().sum())
+        self.assert_eq(psser.shift(2), pser.shift(2))
+        self.assert_eq(psser.shift().shift(-1), pser.shift().shift(-1))
+        self.assert_eq(psser.shift().sum(), pser.shift().sum())

         if LooseVersion(pd.__version__) < LooseVersion("0.24.2"):
-            self.assert_eq(kser.shift(periods=2), pser.shift(periods=2))
+            self.assert_eq(psser.shift(periods=2), pser.shift(periods=2))
         else:
-            self.assert_eq(kser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0))
-        with self.assertRaisesRegex(ValueError, "periods should be an int; however"):
-            kser.shift(periods=1.5)
+            self.assert_eq(
+                psser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0)
+            )
+        with self.assertRaisesRegex(TypeError, "periods should be an int; however"):
+            psser.shift(periods=1.5)
     def test_diff(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.diff(2), pser.diff(2))
-        self.assert_eq(kser.diff().diff(-1), pser.diff().diff(-1))
-        self.assert_eq(kser.diff().sum(), pser.diff().sum())
+        self.assert_eq(psser.diff(2), pser.diff(2))
+        self.assert_eq(psser.diff().diff(-1), pser.diff().diff(-1))
+        self.assert_eq(psser.diff().sum(), pser.diff().sum())

     def _test_numeric_astype(self, pser):
-        kser = ps.Series(pser)
-
-        self.assert_eq(kser.astype(int), pser.astype(int))
-        self.assert_eq(kser.astype(np.int), pser.astype(np.int))
-        self.assert_eq(kser.astype(np.int8), pser.astype(np.int8))
-        self.assert_eq(kser.astype(np.int16), pser.astype(np.int16))
-        self.assert_eq(kser.astype(np.int32), pser.astype(np.int32))
-        self.assert_eq(kser.astype(np.int64), pser.astype(np.int64))
-        self.assert_eq(kser.astype(np.byte), pser.astype(np.byte))
-        self.assert_eq(kser.astype("int"), pser.astype("int"))
-        self.assert_eq(kser.astype("int8"), pser.astype("int8"))
-        self.assert_eq(kser.astype("int16"), pser.astype("int16"))
-        self.assert_eq(kser.astype("int32"), pser.astype("int32"))
-        self.assert_eq(kser.astype("int64"), pser.astype("int64"))
-        self.assert_eq(kser.astype("b"), pser.astype("b"))
-        self.assert_eq(kser.astype("byte"), pser.astype("byte"))
-        self.assert_eq(kser.astype("i"), pser.astype("i"))
-        self.assert_eq(kser.astype("long"), pser.astype("long"))
-        self.assert_eq(kser.astype("short"), pser.astype("short"))
-        self.assert_eq(kser.astype(np.float), pser.astype(np.float))
-        self.assert_eq(kser.astype(np.float32), pser.astype(np.float32))
-        self.assert_eq(kser.astype(np.float64), pser.astype(np.float64))
-        self.assert_eq(kser.astype("float"), pser.astype("float"))
-        self.assert_eq(kser.astype("float32"), pser.astype("float32"))
-        self.assert_eq(kser.astype("float64"), pser.astype("float64"))
-        self.assert_eq(kser.astype("double"), pser.astype("double"))
-        self.assert_eq(kser.astype("f"), pser.astype("f"))
-        self.assert_eq(kser.astype(bool), pser.astype(bool))
-        self.assert_eq(kser.astype("bool"), pser.astype("bool"))
-        self.assert_eq(kser.astype("?"), pser.astype("?"))
-        self.assert_eq(kser.astype(np.unicode_), pser.astype(np.unicode_))
-        self.assert_eq(kser.astype("str"), pser.astype("str"))
-        self.assert_eq(kser.astype("U"), pser.astype("U"))
+        psser = ps.Series(pser)
+
+        self.assert_eq(psser.astype(int), pser.astype(int))
+        self.assert_eq(psser.astype(np.int), pser.astype(np.int))
+        self.assert_eq(psser.astype(np.int8), pser.astype(np.int8))
+        self.assert_eq(psser.astype(np.int16), pser.astype(np.int16))
+        self.assert_eq(psser.astype(np.int32), pser.astype(np.int32))
+        self.assert_eq(psser.astype(np.int64), pser.astype(np.int64))
+        self.assert_eq(psser.astype(np.byte), pser.astype(np.byte))
+        self.assert_eq(psser.astype("int"), pser.astype("int"))
+        self.assert_eq(psser.astype("int8"), pser.astype("int8"))
+        self.assert_eq(psser.astype("int16"), pser.astype("int16"))
+        self.assert_eq(psser.astype("int32"), pser.astype("int32"))
+        self.assert_eq(psser.astype("int64"), pser.astype("int64"))
+        self.assert_eq(psser.astype("b"), pser.astype("b"))
+        self.assert_eq(psser.astype("byte"), pser.astype("byte"))
+        self.assert_eq(psser.astype("i"), pser.astype("i"))
+        self.assert_eq(psser.astype("long"), pser.astype("long"))
+        self.assert_eq(psser.astype("short"), pser.astype("short"))
+        self.assert_eq(psser.astype(np.float), pser.astype(np.float))
+        self.assert_eq(psser.astype(np.float32), pser.astype(np.float32))
+        self.assert_eq(psser.astype(np.float64), pser.astype(np.float64))
+        self.assert_eq(psser.astype("float"), pser.astype("float"))
+        self.assert_eq(psser.astype("float32"), pser.astype("float32"))
+        self.assert_eq(psser.astype("float64"), pser.astype("float64"))
+        self.assert_eq(psser.astype("double"), pser.astype("double"))
+        self.assert_eq(psser.astype("f"), pser.astype("f"))
+        self.assert_eq(psser.astype(bool), pser.astype(bool))
+        self.assert_eq(psser.astype("bool"), pser.astype("bool"))
+        self.assert_eq(psser.astype("?"), pser.astype("?"))
+        self.assert_eq(psser.astype(np.unicode_), pser.astype(np.unicode_))
+        self.assert_eq(psser.astype("str"), pser.astype("str"))
+        self.assert_eq(psser.astype("U"), pser.astype("U"))

         if extension_dtypes_available:
             from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

-            self._check_extension(kser.astype("Int8"), pser.astype("Int8"))
-            self._check_extension(kser.astype("Int16"), pser.astype("Int16"))
-            self._check_extension(kser.astype("Int32"), pser.astype("Int32"))
-            self._check_extension(kser.astype("Int64"), pser.astype("Int64"))
-            self._check_extension(kser.astype(Int8Dtype()), pser.astype(Int8Dtype()))
-            self._check_extension(kser.astype(Int16Dtype()), pser.astype(Int16Dtype()))
-            self._check_extension(kser.astype(Int32Dtype()), pser.astype(Int32Dtype()))
-            self._check_extension(kser.astype(Int64Dtype()), pser.astype(Int64Dtype()))
+            self._check_extension(psser.astype("Int8"), pser.astype("Int8"))
+            self._check_extension(psser.astype("Int16"), pser.astype("Int16"))
+            self._check_extension(psser.astype("Int32"), pser.astype("Int32"))
+            self._check_extension(psser.astype("Int64"), pser.astype("Int64"))
+            self._check_extension(psser.astype(Int8Dtype()), pser.astype(Int8Dtype()))
+            self._check_extension(psser.astype(Int16Dtype()), pser.astype(Int16Dtype()))
+            self._check_extension(psser.astype(Int32Dtype()), pser.astype(Int32Dtype()))
+            self._check_extension(psser.astype(Int64Dtype()), pser.astype(Int64Dtype()))

         if extension_object_dtypes_available:
             from pandas import StringDtype

             if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
-                self._check_extension(kser.astype("string"), pser.astype("string"))
-                self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype()))
+                self._check_extension(psser.astype("string"), pser.astype("string"))
+                self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
             else:
                 self._check_extension(
-                    kser.astype("string"),
+                    psser.astype("string"),
                     pd.Series(["10", "20", "15", "30", "45"], name="x", dtype="string"),
                 )
                 self._check_extension(
-                    kser.astype(StringDtype()),
+                    psser.astype(StringDtype()),
                     pd.Series(["10", "20", "15", "30", "45"], name="x", dtype=StringDtype()),
                 )

         if extension_float_dtypes_available:
             from pandas import Float32Dtype, Float64Dtype

-            self._check_extension(kser.astype("Float32"), pser.astype("Float32"))
-            self._check_extension(kser.astype("Float64"), pser.astype("Float64"))
-            self._check_extension(kser.astype(Float32Dtype()), pser.astype(Float32Dtype()))
-            self._check_extension(kser.astype(Float64Dtype()), pser.astype(Float64Dtype()))
+            self._check_extension(psser.astype("Float32"), pser.astype("Float32"))
+            self._check_extension(psser.astype("Float64"), pser.astype("Float64"))
+            self._check_extension(psser.astype(Float32Dtype()), pser.astype(Float32Dtype()))
+            self._check_extension(psser.astype(Float64Dtype()), pser.astype(Float64Dtype()))

     def test_astype(self):
         psers = [pd.Series([10, 20, 15, 30, 45], name="x")]
@@ -1527,100 +1535,101 @@ def test_astype(self):
             self._test_numeric_astype(pser)

         pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.astype(bool), pser.astype(bool))
-        self.assert_eq(kser.astype(str), pser.astype(str))
+        self.assert_eq(psser.astype(bool), pser.astype(bool))
+        self.assert_eq(psser.astype(str), pser.astype(str))

         pser = pd.Series(["hi", "hi ", " ", " \t", "", None], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.astype(bool), pser.astype(bool))
+        self.assert_eq(psser.astype(bool), pser.astype(bool))
         if LooseVersion("1.1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.1.4"):
             # a pandas bug: https://github.com/databricks/koalas/pull/1818#issuecomment-703961980
-            self.assert_eq(kser.astype(str).tolist(), ["hi", "hi ", " ", " \t", "", "None"])
+            self.assert_eq(psser.astype(str).tolist(), ["hi", "hi ", " ", " \t", "", "None"])
         else:
-            self.assert_eq(kser.astype(str), pser.astype(str))
-        self.assert_eq(kser.str.strip().astype(bool), pser.str.strip().astype(bool))
+            self.assert_eq(psser.astype(str), pser.astype(str))
+        self.assert_eq(psser.str.strip().astype(bool), pser.str.strip().astype(bool))

         if extension_object_dtypes_available:
             from pandas import StringDtype

-            self._check_extension(kser.astype("string"), pser.astype("string"))
-            self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype()))
+            self._check_extension(psser.astype("string"), pser.astype("string"))
+            self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))

         pser = pd.Series([True, False, None], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.astype(bool), pser.astype(bool))
-        # Comment out the below test cause because pandas returns `None` or `nan` randomly
-        # self.assert_eq(kser.astype(str), pser.astype(str))
+        self.assert_eq(psser.astype(bool), pser.astype(bool))
+        self.assert_eq(psser.astype(str), pser.astype(str))

         if extension_object_dtypes_available:
             from pandas import BooleanDtype, StringDtype

-            self._check_extension(kser.astype("boolean"), pser.astype("boolean"))
-            self._check_extension(kser.astype(BooleanDtype()), pser.astype(BooleanDtype()))
+            self._check_extension(psser.astype("boolean"), pser.astype("boolean"))
+            self._check_extension(psser.astype(BooleanDtype()), pser.astype(BooleanDtype()))

             if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
-                self._check_extension(kser.astype("string"), pser.astype("string"))
-                self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype()))
+                self._check_extension(psser.astype("string"), pser.astype("string"))
+                self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
             else:
                 self._check_extension(
-                    kser.astype("string"),
+                    psser.astype("string"),
                     pd.Series(["True", "False", None], name="x", dtype="string"),
                 )
                 self._check_extension(
-                    kser.astype(StringDtype()),
+                    psser.astype(StringDtype()),
                     pd.Series(["True", "False", None], name="x", dtype=StringDtype()),
                 )

         pser = pd.Series(["2020-10-27 00:00:01", None], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.astype(np.datetime64), pser.astype(np.datetime64))
-        self.assert_eq(kser.astype("datetime64[ns]"), pser.astype("datetime64[ns]"))
-        self.assert_eq(kser.astype("M"), pser.astype("M"))
-        self.assert_eq(kser.astype("M").astype(str), pser.astype("M").astype(str))
+        self.assert_eq(psser.astype(np.datetime64), pser.astype(np.datetime64))
+        self.assert_eq(psser.astype("datetime64[ns]"), pser.astype("datetime64[ns]"))
+        self.assert_eq(psser.astype("M"), pser.astype("M"))
+        self.assert_eq(psser.astype("M").astype(str), pser.astype("M").astype(str))
         # Comment out the below test cause because pandas returns `NaT` or `nan` randomly
-        # self.assert_eq(kser.astype("M").dt.date.astype(str), pser.astype("M").dt.date.astype(str))
+        # self.assert_eq(
+        #     psser.astype("M").dt.date.astype(str), pser.astype("M").dt.date.astype(str)
+        # )

         if extension_object_dtypes_available:
             from pandas import StringDtype

             self._check_extension(
-                kser.astype("M").astype("string"), pser.astype("M").astype("string")
+                psser.astype("M").astype("string"), pser.astype("M").astype("string")
             )
             self._check_extension(
-                kser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
+                psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
             )

         with self.assertRaisesRegex(TypeError, "not understood"):
-            kser.astype("int63")
+            psser.astype("int63")

     def test_aggregate(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

         msg = "func must be a string or list of strings"
-        with self.assertRaisesRegex(ValueError, msg):
-            kser.aggregate({"x": ["min", "max"]})
+        with self.assertRaisesRegex(TypeError, msg):
+            psser.aggregate({"x": ["min", "max"]})

         msg = (
             "If the given function is a list, it " "should only contains function names as strings."
         )
         with self.assertRaisesRegex(ValueError, msg):
-            kser.aggregate(["min", max])
+            psser.aggregate(["min", max])

     def test_drop(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.drop(1), pser.drop(1))
-        self.assert_eq(kser.drop([1, 4]), pser.drop([1, 4]))
+        self.assert_eq(psser.drop(1), pser.drop(1))
+        self.assert_eq(psser.drop([1, 4]), pser.drop([1, 4]))

         msg = "Need to specify at least one of 'labels' or 'index'"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.drop()
-        self.assertRaises(KeyError, lambda: kser.drop((0, 1)))
+            psser.drop()
+        self.assertRaises(KeyError, lambda: psser.drop((0, 1)))

         # For MultiIndex
         midx = pd.MultiIndex(
@@ -1628,20 +1637,20 @@ def test_drop(self):
             [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.drop("lama"), pser.drop("lama"))
-        self.assert_eq(kser.drop(labels="weight", level=1), pser.drop(labels="weight", level=1))
-        self.assert_eq(kser.drop(("lama", "weight")), pser.drop(("lama", "weight")))
+        self.assert_eq(psser.drop("lama"), pser.drop("lama"))
+        self.assert_eq(psser.drop(labels="weight", level=1), pser.drop(labels="weight", level=1))
+        self.assert_eq(psser.drop(("lama", "weight")), pser.drop(("lama", "weight")))
         self.assert_eq(
-            kser.drop([("lama", "speed"), ("falcon", "weight")]),
+            psser.drop([("lama", "speed"), ("falcon", "weight")]),
             pser.drop([("lama", "speed"), ("falcon", "weight")]),
         )
-        self.assert_eq(kser.drop({"lama": "speed"}), pser.drop({"lama": "speed"}))
+        self.assert_eq(psser.drop({"lama": "speed"}), pser.drop({"lama": "speed"}))

         msg = "'level' should be less than the number of indexes"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.drop(labels="weight", level=2)
+            psser.drop(labels="weight", level=2)

         msg = (
             "If the given index is a list, it "
            "should only contains names of levels or lists of tuples "
            "that contain index names"
         )
         with self.assertRaisesRegex(ValueError, msg):
-            kser.drop(["lama", ["cow", "falcon"]])
+            psser.drop(["lama", ["cow", "falcon"]])

         msg = "Cannot specify both 'labels' and 'index'"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.drop("lama", index="cow")
+            psser.drop("lama", index="cow")

         msg = r"'Key length \(2\) exceeds index depth \(3\)'"
         with self.assertRaisesRegex(KeyError, msg):
-            kser.drop(("lama", "speed", "x"))
+            psser.drop(("lama", "speed", "x"))

     def test_pop(self):
         midx = pd.MultiIndex(
@@ -1665,41 +1674,41 @@ def test_pop(self):
             [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pdf = pd.DataFrame({"x": [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3]}, index=midx)
-        kdf = ps.from_pandas(pdf)
+        psdf = ps.from_pandas(pdf)

         pser = pdf.x
-        kser = kdf.x
+        psser = psdf.x

-        self.assert_eq(kser.pop(("lama", "speed")), pser.pop(("lama", "speed")))
-        self.assert_eq(kser, pser)
-        self.assert_eq(kdf, pdf)
+        self.assert_eq(psser.pop(("lama", "speed")), pser.pop(("lama", "speed")))
+        self.assert_eq(psser, pser)
+        self.assert_eq(psdf, pdf)

         msg = r"'Key length \(3\) exceeds index depth \(2\)'"
         with self.assertRaisesRegex(KeyError, msg):
-            kser.pop(("lama", "speed", "x"))
+            psser.pop(("lama", "speed", "x"))

     def test_replace(self):
         pser = pd.Series([10, 20, 15, 30, np.nan], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser.replace(), pser.replace())
-        self.assert_eq(kser.replace({}), pser.replace({}))
+        self.assert_eq(psser.replace(), pser.replace())
+        self.assert_eq(psser.replace({}), pser.replace({}))

-        self.assert_eq(kser.replace(np.nan, 45), pser.replace(np.nan, 45))
-        self.assert_eq(kser.replace([10, 15], 45), pser.replace([10, 15], 45))
-        self.assert_eq(kser.replace((10, 15), 45), pser.replace((10, 15), 45))
-        self.assert_eq(kser.replace([10, 15], [45, 50]), pser.replace([10, 15], [45, 50]))
-        self.assert_eq(kser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50)))
+        self.assert_eq(psser.replace(np.nan, 45), pser.replace(np.nan, 45))
+        self.assert_eq(psser.replace([10, 15], 45), pser.replace([10, 15], 45))
+        self.assert_eq(psser.replace((10, 15), 45), pser.replace((10, 15), 45))
+        self.assert_eq(psser.replace([10, 15], [45, 50]), pser.replace([10, 15], [45, 50]))
+        self.assert_eq(psser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50)))

         msg = "'to_replace' should be one of str, list, tuple, dict, int, float"
-        with self.assertRaisesRegex(ValueError, msg):
-            kser.replace(ps.range(5))
+        with self.assertRaisesRegex(TypeError, msg):
+            psser.replace(ps.range(5))

         msg = "Replacement lists must match in length. Expecting 3 got 2"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.replace([10, 20, 30], [1, 2])
+            psser.replace([10, 20, 30], [1, 2])

         msg = "replace currently not support for regex"
         with self.assertRaisesRegex(NotImplementedError, msg):
-            kser.replace(r"^1.$", regex=True)
+            psser.replace(r"^1.$", regex=True)

     def test_xs(self):
         midx = pd.MultiIndex(
@@ -1707,9 +1716,9 @@ def test_xs(self):
             [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.xs(("a", "lama", "speed")), pser.xs(("a", "lama", "speed")))
+        self.assert_eq(psser.xs(("a", "lama", "speed")), pser.xs(("a", "lama", "speed")))

     def test_duplicates(self):
         psers = {
@@ -1722,70 +1731,70 @@ def test_duplicates(self):

         for (msg, pser), keep in product(psers.items(), keeps):
             with self.subTest(msg, keep=keep):
-                kser = ps.Series(pser)
+                psser = ps.Series(pser)

                 self.assert_eq(
                     pser.drop_duplicates(keep=keep).sort_values(),
-                    kser.drop_duplicates(keep=keep).sort_values(),
+                    psser.drop_duplicates(keep=keep).sort_values(),
                 )

     def test_update(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

         msg = "'other' must be a Series"
-        with self.assertRaisesRegex(ValueError, msg):
-            kser.update(10)
+        with self.assertRaisesRegex(TypeError, msg):
+            psser.update(10)

     def test_where(self):
         pser1 = pd.Series([0, 1, 2, 3, 4])
-        kser1 = ps.from_pandas(pser1)
+        psser1 = ps.from_pandas(pser1)

-        self.assert_eq(pser1.where(pser1 > 3), kser1.where(kser1 > 3).sort_index())
+        self.assert_eq(pser1.where(pser1 > 3), psser1.where(psser1 > 3).sort_index())

     def test_mask(self):
         pser1 = pd.Series([0, 1, 2, 3, 4])
-        kser1 = ps.from_pandas(pser1)
+        psser1 = ps.from_pandas(pser1)

-        self.assert_eq(pser1.mask(pser1 > 3), kser1.mask(kser1 > 3).sort_index())
+        self.assert_eq(pser1.mask(pser1 > 3), psser1.mask(psser1 > 3).sort_index())

     def test_truncate(self):
         pser1 = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
-        kser1 = ps.Series(pser1)
+        psser1 = ps.Series(pser1)
         pser2 = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[7, 6, 5, 4, 3, 2, 1])
-        kser2 = ps.Series(pser2)
+        psser2 = ps.Series(pser2)

-        self.assert_eq(kser1.truncate(), pser1.truncate())
-        self.assert_eq(kser1.truncate(before=2), pser1.truncate(before=2))
-        self.assert_eq(kser1.truncate(after=5), pser1.truncate(after=5))
-        self.assert_eq(kser1.truncate(copy=False), pser1.truncate(copy=False))
-        self.assert_eq(kser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False))
+        self.assert_eq(psser1.truncate(), pser1.truncate())
+        self.assert_eq(psser1.truncate(before=2), pser1.truncate(before=2))
+        self.assert_eq(psser1.truncate(after=5), pser1.truncate(after=5))
+        self.assert_eq(psser1.truncate(copy=False), pser1.truncate(copy=False))
+        self.assert_eq(psser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False))

         # The bug for these tests has been fixed in pandas 1.1.0.
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6))
-            self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
+            self.assert_eq(psser2.truncate(4, 6), pser2.truncate(4, 6))
+            self.assert_eq(psser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
         else:
-            expected_kser = ps.Series([20, 30, 40], index=[6, 5, 4])
-            self.assert_eq(kser2.truncate(4, 6), expected_kser)
-            self.assert_eq(kser2.truncate(4, 6, copy=False), expected_kser)
+            expected_psser = ps.Series([20, 30, 40], index=[6, 5, 4])
+            self.assert_eq(psser2.truncate(4, 6), expected_psser)
+            self.assert_eq(psser2.truncate(4, 6, copy=False), expected_psser)

-        kser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1])
+        psser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1])
         msg = "truncate requires a sorted index"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.truncate()
+            psser.truncate()

-        kser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
+        psser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
         msg = "Truncate: 2 must be after 5"
         with self.assertRaisesRegex(ValueError, msg):
-            kser.truncate(5, 2)
+            psser.truncate(5, 2)

     def test_getitem(self):
         pser = pd.Series([10, 20, 15, 30, 45], ["A", "A", "B", "C", "D"])
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser["A"], pser["A"])
-        self.assert_eq(kser["B"], pser["B"])
-        self.assert_eq(kser[kser > 15], pser[pser > 15])
+        self.assert_eq(psser["A"], pser["A"])
+        self.assert_eq(psser["B"], pser["B"])
+        self.assert_eq(psser[psser > 15], pser[pser > 15])

         # for MultiIndex
         midx = pd.MultiIndex(
@@ -1793,15 +1802,15 @@ def test_getitem(self):
             [[0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], name="0", index=midx)
-        kser = ps.Series(pser)
+        psser = ps.Series(pser)

-        self.assert_eq(kser["a"], pser["a"])
-        self.assert_eq(kser["a", "lama"], pser["a", "lama"])
-        self.assert_eq(kser[kser > 1.5], pser[pser > 1.5])
+        self.assert_eq(psser["a"], pser["a"])
+        self.assert_eq(psser["a", "lama"], pser["a", "lama"])
+        self.assert_eq(psser[psser > 1.5], pser[pser > 1.5])

         msg = r"'Key length \(4\) exceeds index depth \(3\)'"
         with self.assertRaisesRegex(KeyError, msg):
-            kser[("a", "lama", "speed", "x")]
+            psser[("a", "lama", "speed", "x")]

     def test_keys(self):
         midx = pd.MultiIndex(
@@ -1809,35 +1818,35 @@ def test_keys(self):
             [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.keys(), pser.keys())
+        self.assert_eq(psser.keys(), pser.keys())

     def test_index(self):
         # to check setting name of Index properly.
         idx = pd.Index([1, 2, 3, 4, 5, 6, 7, 8, 9])
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=idx)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        kser.name = "koalas"
+        psser.name = "koalas"
         pser.name = "koalas"
-        self.assert_eq(kser.index.name, pser.index.name)
+        self.assert_eq(psser.index.name, pser.index.name)

         # for check setting names of MultiIndex properly.
-        kser.names = ["hello", "koalas"]
+        psser.names = ["hello", "koalas"]
         pser.names = ["hello", "koalas"]
-        self.assert_eq(kser.index.names, pser.index.names)
+        self.assert_eq(psser.index.names, pser.index.names)

     def test_pct_change(self):
         pser = pd.Series([90, 91, 85], index=[2, 4, 1])
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.pct_change(), pser.pct_change(), check_exact=False)
-        self.assert_eq(kser.pct_change().sum(), pser.pct_change().sum(), almost=True)
-        self.assert_eq(kser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
-        self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
-        self.assert_eq(kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
-        self.assert_eq(kser.pct_change(periods=100000000), pser.pct_change(periods=100000000))
+        self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False)
+        self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True)
+        self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
+        self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
+        self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
+        self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000))

         # for MultiIndex
         midx = pd.MultiIndex(
@@ -1845,19 +1854,19 @@ def test_pct_change(self):
             [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.pct_change(), pser.pct_change(), check_exact=False)
-        self.assert_eq(kser.pct_change().sum(), pser.pct_change().sum(), almost=True)
-        self.assert_eq(kser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
-        self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
-        self.assert_eq(kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
-        self.assert_eq(kser.pct_change(periods=100000000), pser.pct_change(periods=100000000))
+        self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False)
+        self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True)
+        self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
+        self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
+        self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
+        self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000))

     def test_axes(self):
         pser = pd.Series([90, 91, 85], index=[2, 4, 1])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(kser.axes, pser.axes)
+        psser = ps.from_pandas(pser)
+        self.assert_eq(psser.axes, pser.axes)

         # for MultiIndex
         midx = pd.MultiIndex(
@@ -1865,332 +1874,332 @@ def test_axes(self):
            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
         )
         pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
-        kser = ps.from_pandas(pser)
-        self.assert_eq(kser.axes, pser.axes)
+        psser = ps.from_pandas(pser)
+        self.assert_eq(psser.axes, pser.axes)

     def test_udt(self):
         sparse_values = {0: 0.1, 1: 1.1}
         sparse_vector = SparseVector(len(sparse_values), sparse_values)
         pser = pd.Series([sparse_vector])
-        kser = ps.from_pandas(pser)
-        self.assert_eq(kser, pser)
+        psser = ps.from_pandas(pser)
+        self.assert_eq(psser, pser)
     def test_repeat(self):
         pser = pd.Series(["a", "b", "c"], name="0", index=np.random.rand(3))
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.repeat(3).sort_index(), pser.repeat(3).sort_index())
-        self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index())
+        self.assert_eq(psser.repeat(3).sort_index(), pser.repeat(3).sort_index())
+        self.assert_eq(psser.repeat(0).sort_index(), pser.repeat(0).sort_index())

-        self.assertRaises(ValueError, lambda: kser.repeat(-1))
-        self.assertRaises(ValueError, lambda: kser.repeat("abc"))
+        self.assertRaises(ValueError, lambda: psser.repeat(-1))
+        self.assertRaises(TypeError, lambda: psser.repeat("abc"))

         pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3))
-        kdf = ps.from_pandas(pdf)
+        psdf = ps.from_pandas(pdf)

-        self.assert_eq(kdf.a.repeat(kdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index())
+        self.assert_eq(psdf.a.repeat(psdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index())

     def test_take(self):
         pser = pd.Series([100, 200, 300, 400, 500], name="Koalas")
-        kser = ps.from_pandas(pser)
+        psser = ps.from_pandas(pser)

-        self.assert_eq(kser.take([0, 2, 4]).sort_values(), pser.take([0, 2, 4]).sort_values())
+        self.assert_eq(psser.take([0, 2, 4]).sort_values(), pser.take([0, 2, 4]).sort_values())
         self.assert_eq(
-            kser.take(range(0, 5, 2)).sort_values(), pser.take(range(0, 5, 2)).sort_values()
+            psser.take(range(0, 5, 2)).sort_values(), pser.take(range(0, 5, 2)).sort_values()
         )
-        self.assert_eq(kser.take([-4, -2, 0]).sort_values(), pser.take([-4, -2, 0]).sort_values())
+        self.assert_eq(psser.take([-4, -2, 0]).sort_values(), pser.take([-4, -2, 0]).sort_values())
         self.assert_eq(
-            kser.take(range(-2, 1, 2)).sort_values(), pser.take(range(-2, 1, 2)).sort_values()
+            psser.take(range(-2, 1, 2)).sort_values(), pser.take(range(-2, 1, 2)).sort_values()
         )

         # Checking the type of indices.
- self.assertRaises(ValueError, lambda: kser.take(1)) - self.assertRaises(ValueError, lambda: kser.take("1")) - self.assertRaises(ValueError, lambda: kser.take({1, 2})) - self.assertRaises(ValueError, lambda: kser.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: psser.take(1)) + self.assertRaises(TypeError, lambda: psser.take("1")) + self.assertRaises(TypeError, lambda: psser.take({1, 2})) + self.assertRaises(TypeError, lambda: psser.take({1: None, 2: None})) def test_divmod(self): pser = pd.Series([100, None, 300, None, 500], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - kdiv, kmod = kser.divmod(-100) + kdiv, kmod = psser.divmod(-100) pdiv, pmod = pser.divmod(-100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) - kdiv, kmod = kser.divmod(100) + kdiv, kmod = psser.divmod(100) pdiv, pmod = pser.divmod(100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - kdiv, kmod = kser.divmod(-100) + kdiv, kmod = psser.divmod(-100) pdiv, pmod = pser.floordiv(-100), pser.mod(-100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) - kdiv, kmod = kser.divmod(100) + kdiv, kmod = psser.divmod(100) pdiv, pmod = pser.floordiv(100), pser.mod(100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) def test_rdivmod(self): pser = pd.Series([100, None, 300, None, 500]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - krdiv, krmod = kser.rdivmod(-100) + krdiv, krmod = psser.rdivmod(-100) prdiv, prmod = pser.rdivmod(-100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) - krdiv, krmod = kser.rdivmod(100) + krdiv, krmod = psser.rdivmod(100) prdiv, prmod = pser.rdivmod(100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - krdiv, krmod = kser.rdivmod(-100) + krdiv, krmod = psser.rdivmod(-100) prdiv, prmod = pser.rfloordiv(-100), pser.rmod(-100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) - krdiv, krmod = kser.rdivmod(100) + krdiv, krmod = psser.rdivmod(100) prdiv, prmod = pser.rfloordiv(100), pser.rmod(100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) def test_mod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.mod(-150), pser.mod(-150)) - self.assert_eq(kser.mod(0), pser.mod(0)) - self.assert_eq(kser.mod(150), pser.mod(150)) + self.assert_eq(psser.mod(-150), pser.mod(-150)) + self.assert_eq(psser.mod(0), pser.mod(0)) + self.assert_eq(psser.mod(150), pser.mod(150)) pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6}) - kdf = ps.from_pandas(pdf) - self.assert_eq(kdf.a.mod(kdf.b), pdf.a.mod(pdf.b)) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.a.mod(psdf.b), pdf.a.mod(pdf.b)) def test_mode(self): pser = pd.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan]) - kser = ps.from_pandas(pser) - self.assert_eq(kser.mode(), pser.mode()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.mode(), pser.mode()) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `dropna` argument is added in pandas 0.24. 
self.assert_eq( - kser.mode(dropna=False).sort_values().reset_index(drop=True), + psser.mode(dropna=False).sort_values().reset_index(drop=True), pser.mode(dropna=False).sort_values().reset_index(drop=True), ) pser.name = "x" - kser = ps.from_pandas(pser) - self.assert_eq(kser.mode(), pser.mode()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.mode(), pser.mode()) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `dropna` argument is added in pandas 0.24. self.assert_eq( - kser.mode(dropna=False).sort_values().reset_index(drop=True), + psser.mode(dropna=False).sort_values().reset_index(drop=True), pser.mode(dropna=False).sort_values().reset_index(drop=True), ) def test_rmod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.rmod(-150), pser.rmod(-150)) - self.assert_eq(kser.rmod(0), pser.rmod(0)) - self.assert_eq(kser.rmod(150), pser.rmod(150)) + self.assert_eq(psser.rmod(-150), pser.rmod(-150)) + self.assert_eq(psser.rmod(0), pser.rmod(0)) + self.assert_eq(psser.rmod(150), pser.rmod(150)) pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6}) - kdf = ps.from_pandas(pdf) - self.assert_eq(kdf.a.rmod(kdf.b), pdf.a.rmod(pdf.b)) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.a.rmod(psdf.b), pdf.a.rmod(pdf.b)) def test_asof(self): pser = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.asof(20), pser.asof(20)) - self.assert_eq(kser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index()) - self.assert_eq(kser.asof(100), pser.asof(100)) - self.assert_eq(repr(kser.asof(-100)), repr(pser.asof(-100))) - self.assert_eq(kser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index()) + self.assert_eq(psser.asof(20), pser.asof(20)) + self.assert_eq(psser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index()) + self.assert_eq(psser.asof(100), pser.asof(100)) + self.assert_eq(repr(psser.asof(-100)), repr(pser.asof(-100))) + self.assert_eq(psser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index()) # where cannot be an Index, Series or a DataFrame - self.assertRaises(ValueError, lambda: kser.asof(ps.Index([-100, 100]))) - self.assertRaises(ValueError, lambda: kser.asof(ps.Series([-100, 100]))) - self.assertRaises(ValueError, lambda: kser.asof(ps.DataFrame({"A": [1, 2, 3]}))) + self.assertRaises(ValueError, lambda: psser.asof(ps.Index([-100, 100]))) + self.assertRaises(ValueError, lambda: psser.asof(ps.Series([-100, 100]))) + self.assertRaises(ValueError, lambda: psser.asof(ps.DataFrame({"A": [1, 2, 3]}))) # asof is not supported for a MultiIndex pser.index = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c"), ("y", "d")]) - kser = ps.from_pandas(pser) - self.assertRaises(ValueError, lambda: kser.asof(20)) + psser = ps.from_pandas(pser) + self.assertRaises(ValueError, lambda: psser.asof(20)) # asof requires a sorted index (More precisely, should be a monotonic increasing) - kser = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40], name="Koalas") - self.assertRaises(ValueError, lambda: kser.asof(20)) - kser = ps.Series([1, 2, np.nan, 4], index=[40, 30, 20, 10], name="Koalas") - self.assertRaises(ValueError, lambda: kser.asof(20)) + psser = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40], name="Koalas") + self.assertRaises(ValueError, lambda: psser.asof(20)) + psser = ps.Series([1, 2, np.nan, 4], index=[40, 
30, 20, 10], name="Koalas") + self.assertRaises(ValueError, lambda: psser.asof(20)) pidx = pd.DatetimeIndex(["2013-12-31", "2014-01-02", "2014-01-03"]) pser = pd.Series([1, 2, np.nan], index=pidx) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.asof("2014-01-01"), pser.asof("2014-01-01")) - self.assert_eq(kser.asof("2014-01-02"), pser.asof("2014-01-02")) - self.assert_eq(repr(kser.asof("1999-01-02")), repr(pser.asof("1999-01-02"))) + self.assert_eq(psser.asof("2014-01-01"), pser.asof("2014-01-01")) + self.assert_eq(psser.asof("2014-01-02"), pser.asof("2014-01-02")) + self.assert_eq(repr(psser.asof("1999-01-02")), repr(pser.asof("1999-01-02"))) def test_squeeze(self): # Single value pser = pd.Series([90]) - kser = ps.from_pandas(pser) - self.assert_eq(kser.squeeze(), pser.squeeze()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.squeeze(), pser.squeeze()) # Single value with MultiIndex midx = pd.MultiIndex.from_tuples([("a", "b", "c")]) pser = pd.Series([90], index=midx) - kser = ps.from_pandas(pser) - self.assert_eq(kser.squeeze(), pser.squeeze()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.squeeze(), pser.squeeze()) # Multiple values pser = pd.Series([90, 91, 85]) - kser = ps.from_pandas(pser) - self.assert_eq(kser.squeeze(), pser.squeeze()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.squeeze(), pser.squeeze()) # Multiple values with MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series([90, 91, 85], index=midx) - kser = ps.from_pandas(pser) - self.assert_eq(kser.squeeze(), pser.squeeze()) + psser = ps.from_pandas(pser) + self.assert_eq(psser.squeeze(), pser.squeeze()) def test_swaplevel(self): # MultiIndex with two levels arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pser = pd.Series(["a", "b", "c", "d"], index=pidx) - kser = ps.from_pandas(pser) - self.assert_eq(pser.swaplevel(), kser.swaplevel()) - self.assert_eq(pser.swaplevel(0, 1), kser.swaplevel(0, 1)) - self.assert_eq(pser.swaplevel(1, 1), kser.swaplevel(1, 1)) - self.assert_eq(pser.swaplevel("number", "color"), kser.swaplevel("number", "color")) + psser = ps.from_pandas(pser) + self.assert_eq(pser.swaplevel(), psser.swaplevel()) + self.assert_eq(pser.swaplevel(0, 1), psser.swaplevel(0, 1)) + self.assert_eq(pser.swaplevel(1, 1), psser.swaplevel(1, 1)) + self.assert_eq(pser.swaplevel("number", "color"), psser.swaplevel("number", "color")) # MultiIndex with more than two levels arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"], ["l", "m", "s", "xs"]] pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color", "size")) pser = pd.Series(["a", "b", "c", "d"], index=pidx) - kser = ps.from_pandas(pser) - self.assert_eq(pser.swaplevel(), kser.swaplevel()) - self.assert_eq(pser.swaplevel(0, 1), kser.swaplevel(0, 1)) - self.assert_eq(pser.swaplevel(0, 2), kser.swaplevel(0, 2)) - self.assert_eq(pser.swaplevel(1, 2), kser.swaplevel(1, 2)) - self.assert_eq(pser.swaplevel(1, 1), kser.swaplevel(1, 1)) - self.assert_eq(pser.swaplevel(-1, -2), kser.swaplevel(-1, -2)) - self.assert_eq(pser.swaplevel("number", "color"), kser.swaplevel("number", "color")) - self.assert_eq(pser.swaplevel("number", "size"), kser.swaplevel("number", "size")) - self.assert_eq(pser.swaplevel("color", "size"), kser.swaplevel("color", "size")) + psser = ps.from_pandas(pser) + self.assert_eq(pser.swaplevel(), psser.swaplevel()) + self.assert_eq(pser.swaplevel(0, 1), 
psser.swaplevel(0, 1)) + self.assert_eq(pser.swaplevel(0, 2), psser.swaplevel(0, 2)) + self.assert_eq(pser.swaplevel(1, 2), psser.swaplevel(1, 2)) + self.assert_eq(pser.swaplevel(1, 1), psser.swaplevel(1, 1)) + self.assert_eq(pser.swaplevel(-1, -2), psser.swaplevel(-1, -2)) + self.assert_eq(pser.swaplevel("number", "color"), psser.swaplevel("number", "color")) + self.assert_eq(pser.swaplevel("number", "size"), psser.swaplevel("number", "size")) + self.assert_eq(pser.swaplevel("color", "size"), psser.swaplevel("color", "size")) # Error conditions self.assertRaises(AssertionError, lambda: ps.Series([1, 2]).swaplevel()) - self.assertRaises(IndexError, lambda: kser.swaplevel(0, 9)) - self.assertRaises(KeyError, lambda: kser.swaplevel("not_number", "color")) - self.assertRaises(AssertionError, lambda: kser.swaplevel(copy=False)) + self.assertRaises(IndexError, lambda: psser.swaplevel(0, 9)) + self.assertRaises(KeyError, lambda: psser.swaplevel("not_number", "color")) + self.assertRaises(AssertionError, lambda: psser.swaplevel(copy=False)) def test_swapaxes(self): pser = pd.Series([1, 2, 3], index=["x", "y", "z"], name="ser") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(kser.swapaxes(0, 0), pser.swapaxes(0, 0)) - self.assert_eq(kser.swapaxes("index", "index"), pser.swapaxes("index", "index")) - self.assert_eq((kser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0)) + self.assert_eq(psser.swapaxes(0, 0), pser.swapaxes(0, 0)) + self.assert_eq(psser.swapaxes("index", "index"), pser.swapaxes("index", "index")) + self.assert_eq((psser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0)) - self.assertRaises(AssertionError, lambda: kser.swapaxes(0, 1, copy=False)) - self.assertRaises(ValueError, lambda: kser.swapaxes(0, 1)) - self.assertRaises(ValueError, lambda: kser.swapaxes("index", "columns")) + self.assertRaises(AssertionError, lambda: psser.swapaxes(0, 1, copy=False)) + self.assertRaises(ValueError, lambda: psser.swapaxes(0, 1)) + self.assertRaises(ValueError, lambda: psser.swapaxes("index", "columns")) def test_div_zero_and_nan(self): pser = pd.Series([100, None, -300, None, 500, -700, np.inf, -np.inf], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.div(0), kser.div(0)) - self.assert_eq(pser.truediv(0), kser.truediv(0)) - self.assert_eq(pser / 0, kser / 0) - self.assert_eq(pser.div(np.nan), kser.div(np.nan)) - self.assert_eq(pser.truediv(np.nan), kser.truediv(np.nan)) - self.assert_eq(pser / np.nan, kser / np.nan) + self.assert_eq(pser.div(0), psser.div(0)) + self.assert_eq(pser.truediv(0), psser.truediv(0)) + self.assert_eq(pser / 0, psser / 0) + self.assert_eq(pser.div(np.nan), psser.div(np.nan)) + self.assert_eq(pser.truediv(np.nan), psser.truediv(np.nan)) + self.assert_eq(pser / np.nan, psser / np.nan) # floordiv has different behavior in pandas > 1.0.0 when divide by 0 if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(pser.floordiv(0), kser.floordiv(0)) - self.assert_eq(pser // 0, kser // 0) + self.assert_eq(pser.floordiv(0), psser.floordiv(0)) + self.assert_eq(pser // 0, psser // 0) else: result = pd.Series( [np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf], name="Koalas" ) - self.assert_eq(kser.floordiv(0), result) - self.assert_eq(kser // 0, result) - self.assert_eq(pser.floordiv(np.nan), kser.floordiv(np.nan)) + self.assert_eq(psser.floordiv(0), result) + self.assert_eq(psser // 0, result) + self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan)) def 
test_mad(self): pser = pd.Series([1, 2, 3, 4], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), kser.mad()) + self.assert_eq(pser.mad(), psser.mad()) pser = pd.Series([None, -2, 5, 10, 50, np.nan, -20], name="Koalas") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), kser.mad()) + self.assert_eq(pser.mad(), psser.mad()) pmidx = pd.MultiIndex.from_tuples( [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")] ) pser = pd.Series([1, 2, 3, 4, 5], name="Koalas") pser.index = pmidx - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), kser.mad()) + self.assert_eq(pser.mad(), psser.mad()) pmidx = pd.MultiIndex.from_tuples( [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")] ) pser = pd.Series([None, -2, 5, 50, np.nan], name="Koalas") pser.index = pmidx - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), kser.mad()) + self.assert_eq(pser.mad(), psser.mad()) def test_to_frame(self): pser = pd.Series(["a", "b", "c"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.to_frame(name="a"), kser.to_frame(name="a")) + self.assert_eq(pser.to_frame(name="a"), psser.to_frame(name="a")) # for MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series(["a", "b", "c"], index=midx) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.to_frame(name="a"), kser.to_frame(name="a")) + self.assert_eq(pser.to_frame(name="a"), psser.to_frame(name="a")) def test_shape(self): pser = pd.Series(["a", "b", "c"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.shape, kser.shape) + self.assert_eq(pser.shape, psser.shape) # for MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series(["a", "b", "c"], index=midx) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.shape, kser.shape) + self.assert_eq(pser.shape, psser.shape) @unittest.skipIf(not have_tabulate, tabulate_requirement_message) def test_to_markdown(self): pser = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) # `to_markdown()` is supported in pandas >= 1.0.0 since it's newly added in pandas 1.0.0. 
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assertRaises(NotImplementedError, lambda: kser.to_markdown()) + self.assertRaises(NotImplementedError, lambda: psser.to_markdown()) else: - self.assert_eq(pser.to_markdown(), kser.to_markdown()) + self.assert_eq(pser.to_markdown(), psser.to_markdown()) def test_unstack(self): pser = pd.Series( @@ -2200,112 +2209,112 @@ def test_unstack(self): names=["A", "B", "C"], ), ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) levels = [-3, -2, -1, 0, 1, 2] for level in levels: pandas_result = pser.unstack(level=level) - koalas_result = kser.unstack(level=level).sort_index() - self.assert_eq(pandas_result, koalas_result) - self.assert_eq(pandas_result.index.names, koalas_result.index.names) - self.assert_eq(pandas_result.columns.names, koalas_result.columns.names) + pandas_on_spark_result = psser.unstack(level=level).sort_index() + self.assert_eq(pandas_result, pandas_on_spark_result) + self.assert_eq(pandas_result.index.names, pandas_on_spark_result.index.names) + self.assert_eq(pandas_result.columns.names, pandas_on_spark_result.columns.names) # non-numeric datatypes pser = pd.Series( list("abcd"), index=pd.MultiIndex.from_product([["one", "two"], ["a", "b"]]) ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) levels = [-2, -1, 0, 1] for level in levels: pandas_result = pser.unstack(level=level) - koalas_result = kser.unstack(level=level).sort_index() - self.assert_eq(pandas_result, koalas_result) - self.assert_eq(pandas_result.index.names, koalas_result.index.names) - self.assert_eq(pandas_result.columns.names, koalas_result.columns.names) + pandas_on_spark_result = psser.unstack(level=level).sort_index() + self.assert_eq(pandas_result, pandas_on_spark_result) + self.assert_eq(pandas_result.index.names, pandas_on_spark_result.index.names) + self.assert_eq(pandas_result.columns.names, pandas_on_spark_result.columns.names) # Exceeding the range of level - self.assertRaises(IndexError, lambda: kser.unstack(level=3)) - self.assertRaises(IndexError, lambda: kser.unstack(level=-4)) + self.assertRaises(IndexError, lambda: psser.unstack(level=3)) + self.assertRaises(IndexError, lambda: psser.unstack(level=-4)) # Only support for MultiIndex - kser = ps.Series([10, -2, 4, 7]) - self.assertRaises(ValueError, lambda: kser.unstack()) + psser = ps.Series([10, -2, 4, 7]) + self.assertRaises(ValueError, lambda: psser.unstack()) def test_item(self): - kser = ps.Series([10, 20]) - self.assertRaises(ValueError, lambda: kser.item()) + psser = ps.Series([10, 20]) + self.assertRaises(ValueError, lambda: psser.item()) def test_filter(self): pser = pd.Series([0, 1, 2], index=["one", "two", "three"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.filter(items=["one", "three"]), kser.filter(items=["one", "three"])) - self.assert_eq(pser.filter(regex="e$"), kser.filter(regex="e$")) - self.assert_eq(pser.filter(like="hre"), kser.filter(like="hre")) + self.assert_eq(pser.filter(items=["one", "three"]), psser.filter(items=["one", "three"])) + self.assert_eq(pser.filter(regex="e$"), psser.filter(regex="e$")) + self.assert_eq(pser.filter(like="hre"), psser.filter(like="hre")) with self.assertRaisesRegex(ValueError, "Series does not support columns axis."): - kser.filter(like="hre", axis=1) + psser.filter(like="hre", axis=1) # for MultiIndex midx = pd.MultiIndex.from_tuples([("one", "x"), ("two", "y"), ("three", "z")]) pser = pd.Series([0, 1, 2], index=midx) - kser = ps.from_pandas(pser) + psser = 
ps.from_pandas(pser) self.assert_eq( pser.filter(items=[("one", "x"), ("three", "z")]), - kser.filter(items=[("one", "x"), ("three", "z")]), + psser.filter(items=[("one", "x"), ("three", "z")]), ) with self.assertRaisesRegex(TypeError, "Unsupported type list"): - kser.filter(items=[["one", "x"], ("three", "z")]) + psser.filter(items=[["one", "x"], ("three", "z")]) with self.assertRaisesRegex(ValueError, "The item should not be empty."): - kser.filter(items=[(), ("three", "z")]) + psser.filter(items=[(), ("three", "z")]) def test_abs(self): pser = pd.Series([-2, -1, 0, 1]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(abs(kser), abs(pser)) - self.assert_eq(np.abs(kser), np.abs(pser)) + self.assert_eq(abs(psser), abs(pser)) + self.assert_eq(np.abs(psser), np.abs(pser)) def test_bfill(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.x - kser = kdf.x + psser = psdf.x - self.assert_eq(kser.bfill(), pser.bfill()) - self.assert_eq(kser.bfill()[0], pser.bfill()[0]) + self.assert_eq(psser.bfill(), pser.bfill()) + self.assert_eq(psser.bfill()[0], pser.bfill()[0]) - kser.bfill(inplace=True) + psser.bfill(inplace=True) pser.bfill(inplace=True) - self.assert_eq(kser, pser) - self.assert_eq(kser[0], pser[0]) - self.assert_eq(kdf, pdf) + self.assert_eq(psser, pser) + self.assert_eq(psser[0], pser[0]) + self.assert_eq(psdf, pdf) def test_ffill(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) pser = pdf.x - kser = kdf.x + psser = psdf.x - self.assert_eq(kser.ffill(), pser.ffill()) - self.assert_eq(kser.ffill()[4], pser.ffill()[4]) + self.assert_eq(psser.ffill(), pser.ffill()) + self.assert_eq(psser.ffill()[4], pser.ffill()[4]) - kser.ffill(inplace=True) + psser.ffill(inplace=True) pser.ffill(inplace=True) - self.assert_eq(kser, pser) - self.assert_eq(kser[4], pser[4]) - self.assert_eq(kdf, pdf) + self.assert_eq(psser, pser) + self.assert_eq(psser[4], pser[4]) + self.assert_eq(psdf, pdf) def test_iteritems(self): pser = pd.Series(["A", "B", "C"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - for (p_name, p_items), (k_name, k_items) in zip(pser.iteritems(), kser.iteritems()): + for (p_name, p_items), (k_name, k_items) in zip(pser.iteritems(), psser.iteritems()): self.assert_eq(p_name, k_name) self.assert_eq(p_items, k_items) @@ -2319,136 +2328,136 @@ def test_droplevel(self): names=["level_1", "level_2", "level_3"], ), ) - kser = ps.from_pandas(pser) - - self.assert_eq(pser.droplevel(0), kser.droplevel(0)) - self.assert_eq(pser.droplevel("level_1"), kser.droplevel("level_1")) - self.assert_eq(pser.droplevel(-1), kser.droplevel(-1)) - self.assert_eq(pser.droplevel([0]), kser.droplevel([0])) - self.assert_eq(pser.droplevel(["level_1"]), kser.droplevel(["level_1"])) - self.assert_eq(pser.droplevel((0,)), kser.droplevel((0,))) - self.assert_eq(pser.droplevel(("level_1",)), kser.droplevel(("level_1",))) - self.assert_eq(pser.droplevel([0, 2]), kser.droplevel([0, 2])) + psser = ps.from_pandas(pser) + + self.assert_eq(pser.droplevel(0), psser.droplevel(0)) + self.assert_eq(pser.droplevel("level_1"), psser.droplevel("level_1")) + self.assert_eq(pser.droplevel(-1), psser.droplevel(-1)) + self.assert_eq(pser.droplevel([0]), psser.droplevel([0])) + self.assert_eq(pser.droplevel(["level_1"]), psser.droplevel(["level_1"])) + 
self.assert_eq(pser.droplevel((0,)), psser.droplevel((0,))) + self.assert_eq(pser.droplevel(("level_1",)), psser.droplevel(("level_1",))) + self.assert_eq(pser.droplevel([0, 2]), psser.droplevel([0, 2])) self.assert_eq( - pser.droplevel(["level_1", "level_3"]), kser.droplevel(["level_1", "level_3"]) + pser.droplevel(["level_1", "level_3"]), psser.droplevel(["level_1", "level_3"]) ) - self.assert_eq(pser.droplevel((1, 2)), kser.droplevel((1, 2))) + self.assert_eq(pser.droplevel((1, 2)), psser.droplevel((1, 2))) self.assert_eq( - pser.droplevel(("level_2", "level_3")), kser.droplevel(("level_2", "level_3")) + pser.droplevel(("level_2", "level_3")), psser.droplevel(("level_2", "level_3")) ) with self.assertRaisesRegex(KeyError, "Level {0, 1, 2} not found"): - kser.droplevel({0, 1, 2}) + psser.droplevel({0, 1, 2}) with self.assertRaisesRegex(KeyError, "Level level_100 not found"): - kser.droplevel(["level_1", "level_100"]) + psser.droplevel(["level_1", "level_100"]) with self.assertRaisesRegex( IndexError, "Too many levels: Index has only 3 levels, not 11" ): - kser.droplevel(10) + psser.droplevel(10) with self.assertRaisesRegex( IndexError, "Too many levels: Index has only 3 levels, -10 is not a valid level number", ): - kser.droplevel(-10) + psser.droplevel(-10) with self.assertRaisesRegex( ValueError, "Cannot remove 3 levels from an index with 3 levels: " "at least one level must be left.", ): - kser.droplevel([0, 1, 2]) + psser.droplevel([0, 1, 2]) with self.assertRaisesRegex( ValueError, "Cannot remove 5 levels from an index with 3 levels: " "at least one level must be left.", ): - kser.droplevel([1, 1, 1, 1, 1]) + psser.droplevel([1, 1, 1, 1, 1]) # Tupled names pser.index.names = [("a", "1"), ("b", "2"), ("c", "3")] - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.droplevel([("a", "1"), ("c", "3")]), kser.droplevel([("a", "1"), ("c", "3")]) + pser.droplevel([("a", "1"), ("c", "3")]), psser.droplevel([("a", "1"), ("c", "3")]) ) def test_dot(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq((kdf["b"] * 10).dot(kdf["a"]), (pdf["b"] * 10).dot(pdf["a"])) - self.assert_eq((kdf["b"] * 10).dot(kdf), (pdf["b"] * 10).dot(pdf)) - self.assert_eq((kdf["b"] * 10).dot(kdf + 1), (pdf["b"] * 10).dot(pdf + 1)) + self.assert_eq((psdf["b"] * 10).dot(psdf["a"]), (pdf["b"] * 10).dot(pdf["a"])) + self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf)) + self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1)) def test_tail(self): pser = pd.Series(range(1000), name="Koalas") - kser = ps.from_pandas(pser) - - self.assert_eq(pser.tail(), kser.tail()) - self.assert_eq(pser.tail(10), kser.tail(10)) - self.assert_eq(pser.tail(-990), kser.tail(-990)) - self.assert_eq(pser.tail(0), kser.tail(0)) - self.assert_eq(pser.tail(1001), kser.tail(1001)) - self.assert_eq(pser.tail(-1001), kser.tail(-1001)) - self.assert_eq((pser + 1).tail(), (kser + 1).tail()) - self.assert_eq((pser + 1).tail(10), (kser + 1).tail(10)) - self.assert_eq((pser + 1).tail(-990), (kser + 1).tail(-990)) - self.assert_eq((pser + 1).tail(0), (kser + 1).tail(0)) - self.assert_eq((pser + 1).tail(1001), (kser + 1).tail(1001)) - self.assert_eq((pser + 1).tail(-1001), (kser + 1).tail(-1001)) + psser = ps.from_pandas(pser) + + self.assert_eq(pser.tail(), psser.tail()) + self.assert_eq(pser.tail(10), psser.tail(10)) + self.assert_eq(pser.tail(-990), psser.tail(-990)) + self.assert_eq(pser.tail(0), 
psser.tail(0)) + self.assert_eq(pser.tail(1001), psser.tail(1001)) + self.assert_eq(pser.tail(-1001), psser.tail(-1001)) + self.assert_eq((pser + 1).tail(), (psser + 1).tail()) + self.assert_eq((pser + 1).tail(10), (psser + 1).tail(10)) + self.assert_eq((pser + 1).tail(-990), (psser + 1).tail(-990)) + self.assert_eq((pser + 1).tail(0), (psser + 1).tail(0)) + self.assert_eq((pser + 1).tail(1001), (psser + 1).tail(1001)) + self.assert_eq((pser + 1).tail(-1001), (psser + 1).tail(-1001)) with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"): - kser.tail("10") + psser.tail("10") def test_product(self): pser = pd.Series([10, 20, 30, 40, 50]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) # Containing NA values pser = pd.Series([10, np.nan, 30, np.nan, 50]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod(), almost=True) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod(), almost=True) # All-NA values pser = pd.Series([np.nan, np.nan, np.nan]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) # Empty Series pser = pd.Series([]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) # Boolean Series pser = pd.Series([True, True, True]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) pser = pd.Series([False, False, False]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) pser = pd.Series([True, False, True]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), kser.prod()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), psser.prod()) # With `min_count` parameter pser = pd.Series([10, 20, 30, 40, 50]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=5), kser.prod(min_count=5)) - self.assert_eq(pser.prod(min_count=6), kser.prod(min_count=6)) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=5), psser.prod(min_count=5)) + self.assert_eq(pser.prod(min_count=6), psser.prod(min_count=6)) pser = pd.Series([10, np.nan, 30, np.nan, 50]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=3), kser.prod(min_count=3), almost=True) - self.assert_eq(pser.prod(min_count=4), kser.prod(min_count=4)) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=3), psser.prod(min_count=3), almost=True) + self.assert_eq(pser.prod(min_count=4), psser.prod(min_count=4)) pser = pd.Series([np.nan, np.nan, np.nan]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1)) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1)) pser = pd.Series([]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1)) + psser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1)) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): ps.Series(["a", "b", "c"]).prod() @@ -2460,26 +2469,26 @@ def test_product(self): def test_hasnans(self): # BooleanType pser = pd.Series([True, False, True, True]) - kser = ps.from_pandas(pser) - 
self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) pser = pd.Series([True, False, np.nan, True]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) # TimestampType pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, kser.hasnans) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) def test_last_valid_index(self): pser = pd.Series([250, 1.5, 320, 1, 0.3, None, None, None, None]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) # MultiIndex columns midx = pd.MultiIndex( @@ -2487,48 +2496,48 @@ def test_last_valid_index(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser.index = midx - kser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) # Empty Series pser = pd.Series([]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) def test_first_valid_index(self): # Empty Series pser = pd.Series([]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.first_valid_index(), kser.first_valid_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.first_valid_index(), psser.first_valid_index()) def test_factorize(self): pser = pd.Series(["a", "b", "a", "b"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([5, 1, 5, 1]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = (pser + 1).factorize(sort=True) - kcodes, kuniques = (kser + 1).factorize() + kcodes, kuniques = (psser + 1).factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series(["a", "b", "a", "b"], name="ser", index=["w", "x", "y", "z"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series( ["a", "b", "a", "b"], index=pd.MultiIndex.from_arrays([[4, 3, 2, 1], [1, 2, 3, 4]]) ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) @@ -2536,38 +2545,38 @@ def test_factorize(self): # Deals with None and np.nan # pser = pd.Series(["a", "b", "a", np.nan]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = 
pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([1, None, 3, 2, 1]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series(["a", None, "a"]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([None, np.nan]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize() - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes, kcodes.to_list()) # pandas: Float64Index([], dtype='float64') self.assert_eq(pd.Index([]), kuniques) pser = pd.Series([np.nan, np.nan]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize() - kcodes, kuniques = kser.factorize() + kcodes, kuniques = psser.factorize() self.assert_eq(pcodes, kcodes.to_list()) # pandas: Float64Index([], dtype='float64') self.assert_eq(pd.Index([]), kuniques) @@ -2582,75 +2591,75 @@ def test_factorize(self): pd_below_0_24 = LooseVersion(pd.__version__) < LooseVersion("0.24") pser = pd.Series(["a", "b", "a", np.nan, None]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2) - kcodes, kuniques = kser.factorize(na_sentinel=-2) + kcodes, kuniques = psser.factorize(na_sentinel=-2) self.assert_eq([0, 1, 0, -2, -2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pcodes, puniques = pser.factorize(sort=True, na_sentinel=2) - kcodes, kuniques = kser.factorize(na_sentinel=2) + kcodes, kuniques = psser.factorize(na_sentinel=2) self.assert_eq([0, 1, 0, 2, 2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) if not pd_below_1_1_2: pcodes, puniques = pser.factorize(sort=True, na_sentinel=None) - kcodes, kuniques = kser.factorize(na_sentinel=None) + kcodes, kuniques = psser.factorize(na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) # puniques is Index(['a', 'b', nan], dtype='object') self.assert_eq(ps.Index(["a", "b", None]), kuniques) - kser = ps.Series([1, 2, np.nan, 4, 5]) # Arrow takes np.nan as null - kser.loc[3] = np.nan # Spark takes np.nan as NaN - kcodes, kuniques = kser.factorize(na_sentinel=None) - pcodes, puniques = kser.to_pandas().factorize(sort=True, na_sentinel=None) + psser = ps.Series([1, 2, np.nan, 4, 5]) # Arrow takes np.nan as null + psser.loc[3] = np.nan # Spark takes np.nan as NaN + kcodes, kuniques = psser.factorize(na_sentinel=None) + pcodes, puniques = psser.to_pandas().factorize(sort=True, na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) def test_pad(self): pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(pser.pad(), kser.pad()) + self.assert_eq(pser.pad(), psser.pad()) # Test `inplace=True` pser.pad(inplace=True) - kser.pad(inplace=True) 
- self.assert_eq(pser, kser) + psser.pad(inplace=True) + self.assert_eq(pser, psser) else: expected = ps.Series([np.nan, 2, 3, 4, 4, 6], name="x") - self.assert_eq(expected, kser.pad()) + self.assert_eq(expected, psser.pad()) # Test `inplace=True` - kser.pad(inplace=True) - self.assert_eq(expected, kser) + psser.pad(inplace=True) + self.assert_eq(expected, psser) def test_explode(self): if LooseVersion(pd.__version__) >= LooseVersion("0.25"): pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), kser.explode(), almost=True) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode(), almost=True) # MultiIndex pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), kser.explode(), almost=True) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode(), almost=True) # non-array type Series pser = pd.Series([1, 2, 3, 4]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), kser.explode()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode()) else: pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) expected = pd.Series([1.0, 2.0, 3.0, None, None, 3.0, 4.0], index=[0, 0, 0, 1, 2, 3, 3]) - self.assert_eq(kser.explode(), expected) + self.assert_eq(psser.explode(), expected) # MultiIndex pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) expected = pd.Series( [1.0, 2.0, 3.0, None, None, 3.0, 4.0], index=pd.MultiIndex.from_tuples( @@ -2665,106 +2674,106 @@ def test_explode(self): ] ), ) - self.assert_eq(kser.explode(), expected) + self.assert_eq(psser.explode(), expected) # non-array type Series pser = pd.Series([1, 2, 3, 4]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) expected = pser - self.assert_eq(kser.explode(), expected) + self.assert_eq(psser.explode(), expected) def test_argsort(self): # Without null values pser = pd.Series([0, -100, 50, 100, 20], index=["A", "B", "C", "D", "E"]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] ) - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # With name pser.name = "Koalas" - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # Series from Index pidx = pd.Index([4.0, -6.0, 2.0, -100.0, 
11.0, 20.0, 1.0, -99.0]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() ) # Series from Index with name pidx.name = "Koalas" - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() ) # Series from DataFrame pdf = pd.DataFrame({"A": [4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]}) - kdf = ps.from_pandas(pdf) - self.assert_eq(pdf.A.argsort().sort_index(), kdf.A.argsort().sort_index()) - self.assert_eq((-pdf.A).argsort().sort_index(), (-kdf.A).argsort().sort_index()) + psdf = ps.from_pandas(pdf) + self.assert_eq(pdf.A.argsort().sort_index(), psdf.A.argsort().sort_index()) + self.assert_eq((-pdf.A).argsort().sort_index(), (-psdf.A).argsort().sort_index()) # With null values pser = pd.Series([0, -100, np.nan, 100, np.nan], index=["A", "B", "C", "D", "E"]) - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # MultiIndex with null values pser.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] ) - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # With name with null values pser.name = "Koalas" - kser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) # Series from Index with null values pidx = pd.Index([4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]) - kidx = ps.from_pandas(pidx) + psidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() ) # Series from Index with name with null values pidx.name = "Koalas" - kidx = ps.from_pandas(pidx) + psidx = 
ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() ) # Series from DataFrame with null values pdf = pd.DataFrame({"A": [4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]}) - kdf = ps.from_pandas(pdf) - self.assert_eq(pdf.A.argsort().sort_index(), kdf.A.argsort().sort_index()) - self.assert_eq((-pdf.A).argsort().sort_index(), (-kdf.A).argsort().sort_index()) + psdf = ps.from_pandas(pdf) + self.assert_eq(pdf.A.argsort().sort_index(), psdf.A.argsort().sort_index()) + self.assert_eq((-pdf.A).argsort().sort_index(), (-psdf.A).argsort().sort_index()) def test_argmin_argmax(self): pser = pd.Series( @@ -2778,34 +2787,34 @@ def test_argmin_argmax(self): }, name="Koalas", ) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(pser.argmin(), kser.argmin()) - self.assert_eq(pser.argmax(), kser.argmax()) + self.assert_eq(pser.argmin(), psser.argmin()) + self.assert_eq(pser.argmax(), psser.argmax()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] ) - kser = ps.from_pandas(pser) - self.assert_eq(pser.argmin(), kser.argmin()) - self.assert_eq(pser.argmax(), kser.argmax()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argmin(), psser.argmin()) + self.assert_eq(pser.argmax(), psser.argmax()) # Null Series self.assert_eq(pd.Series([np.nan]).argmin(), ps.Series([np.nan]).argmin()) self.assert_eq(pd.Series([np.nan]).argmax(), ps.Series([np.nan]).argmax()) else: - self.assert_eq(pser.values.argmin(), kser.argmin()) - self.assert_eq(pser.values.argmax(), kser.argmax()) + self.assert_eq(pser.values.argmin(), psser.argmin()) + self.assert_eq(pser.values.argmax(), psser.argmax()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] ) - kser = ps.from_pandas(pser) - self.assert_eq(pser.values.argmin(), kser.argmin()) - self.assert_eq(pser.values.argmax(), kser.argmax()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.values.argmin(), psser.argmin()) + self.assert_eq(pser.values.argmax(), psser.argmax()) # Null Series self.assert_eq(-1, ps.Series([np.nan]).argmin()) @@ -2818,91 +2827,91 @@ def test_argmin_argmax(self): def test_backfill(self): pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(pser.backfill(), kser.backfill()) + self.assert_eq(pser.backfill(), psser.backfill()) # Test `inplace=True` pser.backfill(inplace=True) - kser.backfill(inplace=True) - self.assert_eq(pser, kser) + psser.backfill(inplace=True) + self.assert_eq(pser, psser) else: expected = ps.Series([2.0, 2.0, 3.0, 4.0, 6.0, 6.0], name="x") - self.assert_eq(expected, kser.backfill()) + self.assert_eq(expected, psser.backfill()) # Test `inplace=True` - kser.backfill(inplace=True) - self.assert_eq(expected, kser) + psser.backfill(inplace=True) + self.assert_eq(expected, psser) def test_align(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - kdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) 
for join in ["outer", "inner", "left", "right"]: for axis in [None, 0]: - kser_l, kser_r = kdf.a.align(kdf.b, join=join, axis=axis) + psser_l, psser_r = psdf.a.align(psdf.b, join=join, axis=axis) pser_l, pser_r = pdf.a.align(pdf.b, join=join, axis=axis) - self.assert_eq(kser_l, pser_l) - self.assert_eq(kser_r, pser_r) + self.assert_eq(psser_l, pser_l) + self.assert_eq(psser_r, pser_r) - kser_l, kdf_r = kdf.b.align(kdf[["b", "a"]], join=join, axis=axis) + psser_l, psdf_r = psdf.b.align(psdf[["b", "a"]], join=join, axis=axis) pser_l, pdf_r = pdf.b.align(pdf[["b", "a"]], join=join, axis=axis) - self.assert_eq(kser_l, pser_l) - self.assert_eq(kdf_r, pdf_r) + self.assert_eq(psser_l, pser_l) + self.assert_eq(psdf_r, pdf_r) - self.assertRaises(ValueError, lambda: kdf.a.align(kdf.b, axis=1)) + self.assertRaises(ValueError, lambda: psdf.a.align(psdf.b, axis=1)) def test_pow_and_rpow(self): pser = pd.Series([1, 2, np.nan]) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) - self.assert_eq(pser.pow(np.nan), kser.pow(np.nan)) - self.assert_eq(pser ** np.nan, kser ** np.nan) - self.assert_eq(pser.rpow(np.nan), kser.rpow(np.nan)) - self.assert_eq(1 ** pser, 1 ** kser) + self.assert_eq(pser.pow(np.nan), psser.pow(np.nan)) + self.assert_eq(pser ** np.nan, psser ** np.nan) + self.assert_eq(pser.rpow(np.nan), psser.rpow(np.nan)) + self.assert_eq(1 ** pser, 1 ** psser) def test_between_time(self): idx = pd.date_range("2018-04-09", periods=4, freq="1D20min") pser = pd.Series([1, 2, 3, 4], index=idx) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - kser.between_time("0:15", "0:45").sort_index(), + psser.between_time("0:15", "0:45").sort_index(), ) pser.index.name = "ts" - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - kser.between_time("0:15", "0:45").sort_index(), + psser.between_time("0:15", "0:45").sort_index(), ) pser.index.name = "index" - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - kser.between_time("0:15", "0:45").sort_index(), + psser.between_time("0:15", "0:45").sort_index(), ) def test_at_time(self): idx = pd.date_range("2018-04-09", periods=4, freq="1D20min") pser = pd.Series([1, 2, 3, 4], index=idx) - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), ) pser.index.name = "ts" - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), ) pser.index.name = "index" - kser = ps.from_pandas(pser) + psser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), ) @@ -2911,7 +2920,8 @@ def test_at_time(self): try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) From 89eb7229bb050fae5d76d1173e54fdff113678db Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 8 Jun 2021 
18:33:43 -0700
Subject: [PATCH 20/30] change # of predicates

---
 .../sql/execution/benchmark/BloomFilterBenchmark.scala | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index d29c6253ab746..ff7e2d8bc880d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -45,7 +45,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private val N = scaleFactor * 1000 * 1000
 
   private val df1 = spark.range(N).map(_ => Random.nextInt)
-  private val df2 = Seq.fill(N) {UUID.randomUUID().toString.replace("-", "")}.toDF
+  private val df2 = Seq.fill(1000 * 1000) {UUID.randomUUID().toString.replace("-", "")}.toDF
 
   private def writeORCBenchmark(): Unit = {
     withTempPath { dir =>
@@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private def readORCBenchmarkForInSet(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
-      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF")
@@ -96,7 +96,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
 
       runBenchmark(s"ORC Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.orc(path + "/withoutBF").where(filter).noop()
         }
@@ -161,7 +161,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
-      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF")
@@ -171,7 +171,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .parquet(path + "/withBF")
 
       runBenchmark(s"Parquet Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50)
             .parquet(path + "/withoutBF").where(filter).noop()

From 2d089ff51b55fe08a2ea5d4db3ddd667a1a14010 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 18:38:20 -0700
Subject: [PATCH 21/30] Revert "change # of predicates"

This reverts commit 89eb7229bb050fae5d76d1173e54fdff113678db.
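For context on what is being reverted: df2.sample(fraction, seed) keeps each
row independently with probability `fraction`, so the expected number of
values that land in the IN filter is roughly rowCount * fraction. A minimal
sketch of that arithmetic (Python for brevity; the 100M-row figure assumes
scaleFactor = 100, matching the "100M rows" figures in the committed
benchmark results):

    # Expected IN-list size for Dataset.sample: each row is kept with
    # probability `fraction`, so the mean sample size is rows * fraction.
    def expected_in_list_size(rows: int, fraction: float) -> float:
        return rows * fraction

    # Original setup: 100M UUID rows sampled at 3e-7 -> ~30 IN predicates.
    print(expected_in_list_size(100_000_000, 0.0000003))  # ~30
    # Setup from the reverted commit: 1M rows at 3e-6 -> ~3 IN predicates.
    print(expected_in_list_size(1_000_000, 0.000003))     # ~3

Either setting presumably keeps the IN list under the inFilterThreshold of
50 that the Parquet read cases set explicitly.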
---
 .../sql/execution/benchmark/BloomFilterBenchmark.scala | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index ff7e2d8bc880d..d29c6253ab746 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -45,7 +45,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private val N = scaleFactor * 1000 * 1000
 
   private val df1 = spark.range(N).map(_ => Random.nextInt)
-  private val df2 = Seq.fill(1000 * 1000) {UUID.randomUUID().toString.replace("-", "")}.toDF
+  private val df2 = Seq.fill(N) {UUID.randomUUID().toString.replace("-", "")}.toDF
 
   private def writeORCBenchmark(): Unit = {
     withTempPath { dir =>
@@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private def readORCBenchmarkForInSet(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
-      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF")
@@ -96,7 +96,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
 
       runBenchmark(s"ORC Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
+        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.orc(path + "/withoutBF").where(filter).noop()
         }
@@ -161,7 +161,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
-      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF")
@@ -171,7 +171,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .parquet(path + "/withBF")
 
       runBenchmark(s"Parquet Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
+        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50)
             .parquet(path + "/withoutBF").where(filter).noop()

From 02200239e3fbcd53798a721f4ec7516bc74aa63f Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 18:52:10 -0700
Subject: [PATCH 22/30] Revert "[SPARK-35098][PYTHON] Re-enable pandas-on-Spark test cases"

This reverts commit b9e4faf2ed6b1d7e79af07ae3409e9a50f6844c2.
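Context for the large diff that follows: the reverted commit had renamed the
Koalas-era fixture variables in these tests to pandas-on-Spark names, so
reverting restores kdf/kser/kidx. A rough sketch of the mechanical mapping
being undone (the helper and rename table below are illustrative only, not
code from the patch; the real commit also renamed derived names such as
kser_l, which this whole-word sketch would not catch):

    import re

    # Forward rename applied by the reverted commit; this revert is the
    # inverse. \b restricts matches to whole identifiers.
    RENAMES = {
        r"\bkdf\b": "psdf",    # Koalas DataFrame fixture -> pandas-on-Spark
        r"\bkser\b": "psser",  # Koalas Series fixture
        r"\bkidx\b": "psidx",  # Koalas Index fixture
    }

    def to_pandas_on_spark_names(source: str) -> str:
        for pattern, replacement in RENAMES.items():
            source = re.sub(pattern, replacement, source)
        return source

    assert to_pandas_on_spark_names("kdf = ps.from_pandas(pdf)") == \
        "psdf = ps.from_pandas(pdf)"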
--- .../pyspark/pandas/tests/indexes/test_base.py | 1546 ++++++------ python/pyspark/pandas/tests/test_series.py | 2110 ++++++++--------- 2 files changed, 1816 insertions(+), 1840 deletions(-) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index b6d4182825ee6..a0eb243a6c56a 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -43,7 +43,7 @@ def pdf(self): ) @property - def psdf(self): + def kdf(self): return ps.from_pandas(self.pdf) def test_index_basic(self): @@ -61,55 +61,55 @@ def test_index_basic(self): pd.DataFrame(np.random.randn(10, 5), index=pd.Categorical(list("abcdefghij"))), pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")).set_index(["a", "b"]), ]: - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.index, pdf.index) - self.assert_eq(type(psdf.index).__name__, type(pdf.index).__name__) + kdf = ps.from_pandas(pdf) + self.assert_eq(kdf.index, pdf.index) + self.assert_eq(type(kdf.index).__name__, type(pdf.index).__name__) def test_index_from_series(self): pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(ps.Index(psser), pd.Index(pser)) - self.assert_eq(ps.Index(psser, dtype="float"), pd.Index(pser, dtype="float")) - self.assert_eq(ps.Index(psser, name="x"), pd.Index(pser, name="x")) + self.assert_eq(ps.Index(kser), pd.Index(pser)) + self.assert_eq(ps.Index(kser, dtype="float"), pd.Index(pser, dtype="float")) + self.assert_eq(ps.Index(kser, name="x"), pd.Index(pser, name="x")) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser)) - self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser)) + self.assert_eq(ps.Int64Index(kser), pd.Int64Index(pser)) + self.assert_eq(ps.Float64Index(kser), pd.Float64Index(pser)) else: - self.assert_eq(ps.Int64Index(psser), pd.Int64Index(pser).rename("a")) - self.assert_eq(ps.Float64Index(psser), pd.Float64Index(pser).rename("a")) + self.assert_eq(ps.Int64Index(kser), pd.Int64Index(pser).rename("a")) + self.assert_eq(ps.Float64Index(kser), pd.Float64Index(pser).rename("a")) pser = pd.Series([datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(ps.Index(psser), pd.Index(pser)) - self.assert_eq(ps.DatetimeIndex(psser), pd.DatetimeIndex(pser)) + self.assert_eq(ps.Index(kser), pd.Index(pser)) + self.assert_eq(ps.DatetimeIndex(kser), pd.DatetimeIndex(pser)) def test_index_from_index(self): pidx = pd.Index([1, 2, 3], name="a") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(ps.Index(psidx), pd.Index(pidx)) - self.assert_eq(ps.Index(psidx, dtype="float"), pd.Index(pidx, dtype="float")) - self.assert_eq(ps.Index(psidx, name="x"), pd.Index(pidx, name="x")) + self.assert_eq(ps.Index(kidx), pd.Index(pidx)) + self.assert_eq(ps.Index(kidx, dtype="float"), pd.Index(pidx, dtype="float")) + self.assert_eq(ps.Index(kidx, name="x"), pd.Index(pidx, name="x")) - self.assert_eq(ps.Int64Index(psidx), pd.Int64Index(pidx)) - self.assert_eq(ps.Float64Index(psidx), pd.Float64Index(pidx)) + self.assert_eq(ps.Int64Index(kidx), pd.Int64Index(pidx)) + self.assert_eq(ps.Float64Index(kidx), pd.Float64Index(pidx)) pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(ps.Index(psidx), 
pd.Index(pidx)) - self.assert_eq(ps.DatetimeIndex(psidx), pd.DatetimeIndex(pidx)) + self.assert_eq(ps.Index(kidx), pd.Index(pidx)) + self.assert_eq(ps.DatetimeIndex(kidx), pd.DatetimeIndex(pidx)) def test_index_getattr(self): - psidx = self.psdf.index + kidx = self.kdf.index item = "databricks" expected_error_message = "'.*Index' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): - psidx.__getattr__(item) + kidx.__getattr__(item) with self.assertRaisesRegex(AttributeError, expected_error_message): ps.from_pandas(pd.date_range("2011-01-01", freq="D", periods=10)).__getattr__(item) @@ -117,148 +117,148 @@ def test_multi_index_getattr(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - psdf = ps.from_pandas(pdf) - psidx = psdf.index + kdf = ps.from_pandas(pdf) + kidx = kdf.index item = "databricks" expected_error_message = "'MultiIndex' object has no attribute '{}'".format(item) with self.assertRaisesRegex(AttributeError, expected_error_message): - psidx.__getattr__(item) + kidx.__getattr__(item) def test_to_series(self): pidx = self.pdf.index - psidx = self.psdf.index + kidx = self.kdf.index - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + self.assert_eq(kidx.to_series(), pidx.to_series()) + self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) # With name pidx.name = "Koalas" - psidx.name = "Koalas" - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) + kidx.name = "Koalas" + self.assert_eq(kidx.to_series(), pidx.to_series()) + self.assert_eq(kidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) # With tupled name pidx.name = ("x", "a") - psidx.name = ("x", "a") - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + kidx.name = ("x", "a") + self.assert_eq(kidx.to_series(), pidx.to_series()) + self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) - self.assert_eq((psidx + 1).to_series(), (pidx + 1).to_series()) + self.assert_eq((kidx + 1).to_series(), (pidx + 1).to_series()) pidx = self.pdf.set_index("b", append=True).index - psidx = self.psdf.set_index("b", append=True).index + kidx = self.kdf.set_index("b", append=True).index with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + self.assert_eq(kidx.to_series(), pidx.to_series()) + self.assert_eq(kidx.to_series(name="a"), pidx.to_series(name="a")) expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - psidx.to_series(name=["x", "a"]) + kidx.to_series(name=["x", "a"]) def test_to_frame(self): pidx = self.pdf.index - psidx = self.psdf.index + kidx = self.kdf.index - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + self.assert_eq(kidx.to_frame(), pidx.to_frame()) + self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) pidx.name = "a" - psidx.name = "a" + kidx.name = "a" - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + 
self.assert_eq(kidx.to_frame(), pidx.to_frame()) + self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. - self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) + self.assert_eq(kidx.to_frame(name="x"), pidx.to_frame(name="x")) self.assert_eq( - psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"), + kidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"), ) - self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: kidx.to_frame(name=["x"])) # non-string name - self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + self.assert_eq(kidx.to_frame(name=10), pidx.to_frame(name=10)) + self.assert_eq(kidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) pidx = self.pdf.set_index("b", append=True).index - psidx = self.psdf.set_index("b", append=True).index + kidx = self.kdf.set_index("b", append=True).index - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + self.assert_eq(kidx.to_frame(), pidx.to_frame()) + self.assert_eq(kidx.to_frame(index=False), pidx.to_frame(index=False)) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `name` argument is added in pandas 0.24. - self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) - self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) + self.assert_eq(kidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) + self.assert_eq(kidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) self.assert_eq( - psidx.to_frame(index=False, name=["x", "y"]), + kidx.to_frame(index=False, name=["x", "y"]), pidx.to_frame(index=False, name=["x", "y"]), ) - self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) - self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: kidx.to_frame(name="x")) + self.assertRaises(ValueError, lambda: kidx.to_frame(name=["x"])) # non-string names - self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + self.assert_eq(kidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) + self.assert_eq(kidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) self.assert_eq( - psidx.to_frame(name=[("x", 10), ("y", 20)]), + kidx.to_frame(name=[("x", 10), ("y", 20)]), pidx.to_frame(name=[("x", 10), ("y", 20)]), ) def test_index_names(self): - psdf = self.psdf - self.assertIsNone(psdf.index.name) + kdf = self.kdf + self.assertIsNone(kdf.index.name) idx = pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") pdf = pd.DataFrame(np.random.randn(10, 5), index=idx, columns=list("abcde")) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.a - psser = psdf.a + kser = kdf.a - self.assertEqual(psdf.index.name, pdf.index.name) - self.assertEqual(psdf.index.names, pdf.index.names) + self.assertEqual(kdf.index.name, pdf.index.name) + self.assertEqual(kdf.index.names, pdf.index.names) pidx = pdf.index - psidx = psdf.index + kidx = kdf.index pidx.name = "renamed" - psidx.name = "renamed" - self.assertEqual(psidx.name, pidx.name) - self.assertEqual(psidx.names, pidx.names) - self.assert_eq(psidx, pidx) - 
self.assertEqual(psdf.index.name, pdf.index.name) - self.assertEqual(psdf.index.names, pdf.index.names) - self.assertEqual(psser.index.names, pser.index.names) + kidx.name = "renamed" + self.assertEqual(kidx.name, pidx.name) + self.assertEqual(kidx.names, pidx.names) + self.assert_eq(kidx, pidx) + self.assertEqual(kdf.index.name, pdf.index.name) + self.assertEqual(kdf.index.names, pdf.index.names) + self.assertEqual(kser.index.names, pser.index.names) pidx.name = None - psidx.name = None - self.assertEqual(psidx.name, pidx.name) - self.assertEqual(psidx.names, pidx.names) - self.assert_eq(psidx, pidx) - self.assertEqual(psdf.index.name, pdf.index.name) - self.assertEqual(psdf.index.names, pdf.index.names) - self.assertEqual(psser.index.names, pser.index.names) + kidx.name = None + self.assertEqual(kidx.name, pidx.name) + self.assertEqual(kidx.names, pidx.names) + self.assert_eq(kidx, pidx) + self.assertEqual(kdf.index.name, pdf.index.name) + self.assertEqual(kdf.index.names, pdf.index.names) + self.assertEqual(kser.index.names, pser.index.names) with self.assertRaisesRegex(ValueError, "Names must be a list-like"): - psidx.names = "hi" + kidx.names = "hi" expected_error_message = "Length of new names must be {}, got {}".format( - psdf._internal.index_level, len(["0", "1"]) + kdf._internal.index_level, len(["0", "1"]) ) with self.assertRaisesRegex(ValueError, expected_error_message): - psidx.names = ["0", "1"] + kidx.names = ["0", "1"] expected_error_message = "Index.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): ps.Index([1, 2, 3], name=["0", "1"]) with self.assertRaisesRegex(TypeError, expected_error_message): - psidx.name = ["renamed"] + kidx.name = ["renamed"] with self.assertRaisesRegex(TypeError, expected_error_message): - psidx.name = ["0", "1"] + kidx.name = ["0", "1"] with self.assertRaisesRegex(TypeError, expected_error_message): ps.Index([(1, 2), (3, 4)], names=["a", ["b"]]) @@ -266,143 +266,143 @@ def test_multi_index_names(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.names, pdf.index.names) + self.assertEqual(kdf.index.names, pdf.index.names) pidx = pdf.index - psidx = psdf.index + kidx = kdf.index pidx.names = ["renamed_number", "renamed_color"] - psidx.names = ["renamed_number", "renamed_color"] - self.assertEqual(psidx.names, pidx.names) + kidx.names = ["renamed_number", "renamed_color"] + self.assertEqual(kidx.names, pidx.names) pidx.names = ["renamed_number", None] - psidx.names = ["renamed_number", None] - self.assertEqual(psidx.names, pidx.names) - self.assert_eq(psidx, pidx) + kidx.names = ["renamed_number", None] + self.assertEqual(kidx.names, pidx.names) + self.assert_eq(kidx, pidx) with self.assertRaises(PandasNotImplementedError): - psidx.name + kidx.name with self.assertRaises(PandasNotImplementedError): - psidx.name = "renamed" + kidx.name = "renamed" def test_index_rename(self): pdf = pd.DataFrame( np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") ) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pidx = pdf.index - psidx = psdf.index + kidx = kdf.index - self.assert_eq(psidx.rename("y"), pidx.rename("y")) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kidx.rename("y"), pidx.rename("y")) + self.assert_eq(kdf.index.names, pdf.index.names) # 
non-string names - self.assert_eq(psidx.rename(0), pidx.rename(0)) - self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0))) + self.assert_eq(kidx.rename(0), pidx.rename(0)) + self.assert_eq(kidx.rename(("y", 0)), pidx.rename(("y", 0))) - psidx.rename("z", inplace=True) + kidx.rename("z", inplace=True) pidx.rename("z", inplace=True) - self.assert_eq(psidx, pidx) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kidx, pidx) + self.assert_eq(kdf.index.names, pdf.index.names) - self.assert_eq(psidx.rename(None), pidx.rename(None)) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kidx.rename(None), pidx.rename(None)) + self.assert_eq(kdf.index.names, pdf.index.names) - self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"])) + self.assertRaises(TypeError, lambda: kidx.rename(["x", "y"])) def test_multi_index_rename(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pmidx = pdf.index - psmidx = psdf.index + kmidx = kdf.index - self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) + self.assert_eq(kdf.index.names, pdf.index.names) # non-string names - self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1])) + self.assert_eq(kmidx.rename([0, 1]), pmidx.rename([0, 1])) self.assert_eq( - psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) + kmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) ) - psmidx.rename(["num", "col"], inplace=True) + kmidx.rename(["num", "col"], inplace=True) pmidx.rename(["num", "col"], inplace=True) - self.assert_eq(psmidx, pmidx) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kmidx, pmidx) + self.assert_eq(kdf.index.names, pdf.index.names) - self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None])) - self.assert_eq(psdf.index.names, pdf.index.names) + self.assert_eq(kmidx.rename([None, None]), pmidx.rename([None, None])) + self.assert_eq(kdf.index.names, pdf.index.names) - self.assertRaises(TypeError, lambda: psmidx.rename("number")) - self.assertRaises(TypeError, lambda: psmidx.rename(None)) - self.assertRaises(ValueError, lambda: psmidx.rename(["number"])) + self.assertRaises(TypeError, lambda: kmidx.rename("number")) + self.assertRaises(TypeError, lambda: kmidx.rename(None)) + self.assertRaises(ValueError, lambda: kmidx.rename(["number"])) def test_multi_index_levshape(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psidx = ps.from_pandas(pidx) - self.assertEqual(pidx.levshape, psidx.levshape) + kidx = ps.from_pandas(pidx) + self.assertEqual(pidx.levshape, kidx.levshape) def test_index_unique(self): - psidx = self.psdf.index + kidx = self.kdf.index # here the output is different than pandas in terms of order expected = [0, 1, 3, 5, 6, 8, 9] - self.assert_eq(expected, sorted(psidx.unique().to_pandas())) - self.assert_eq(expected, sorted(psidx.unique(level=0).to_pandas())) + self.assert_eq(expected, sorted(kidx.unique().to_pandas())) + self.assert_eq(expected, sorted(kidx.unique(level=0).to_pandas())) expected = [1, 2, 4, 6, 7, 9, 10] - self.assert_eq(expected, sorted((psidx + 1).unique().to_pandas())) + self.assert_eq(expected, sorted((kidx + 1).unique().to_pandas())) with 
self.assertRaisesRegex(IndexError, "Too many levels*"): - psidx.unique(level=1) + kidx.unique(level=1) with self.assertRaisesRegex(KeyError, "Requested level (hi)*"): - psidx.unique(level="hi") + kidx.unique(level="hi") def test_multi_index_copy(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pdf = pd.DataFrame(np.random.randn(4, 5), idx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq(psdf.index.copy(), pdf.index.copy()) + self.assert_eq(kdf.index.copy(), pdf.index.copy()) def test_drop_duplicates(self): pidx = pd.Index([4, 2, 4, 1, 4, 3]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.drop_duplicates().sort_values(), pidx.drop_duplicates().sort_values()) + self.assert_eq(kidx.drop_duplicates().sort_values(), pidx.drop_duplicates().sort_values()) self.assert_eq( - (psidx + 1).drop_duplicates().sort_values(), (pidx + 1).drop_duplicates().sort_values() + (kidx + 1).drop_duplicates().sort_values(), (pidx + 1).drop_duplicates().sort_values() ) def test_dropna(self): pidx = pd.Index([np.nan, 2, 4, 1, np.nan, 3]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.dropna(), pidx.dropna()) - self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna()) + self.assert_eq(kidx.dropna(), pidx.dropna()) + self.assert_eq((kidx + 1).dropna(), (pidx + 1).dropna()) def test_index_symmetric_difference(self): pidx1 = pd.Index([1, 2, 3, 4]) pidx2 = pd.Index([2, 3, 4, 5]) - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) self.assert_eq( - psidx1.symmetric_difference(psidx2).sort_values(), + kidx1.symmetric_difference(kidx2).sort_values(), pidx1.symmetric_difference(pidx2).sort_values(), ) self.assert_eq( - (psidx1 + 1).symmetric_difference(psidx2).sort_values(), + (kidx1 + 1).symmetric_difference(kidx2).sort_values(), (pidx1 + 1).symmetric_difference(pidx2).sort_values(), ) @@ -414,11 +414,11 @@ def test_index_symmetric_difference(self): [["koalas", "cow", "falcon"], ["speed", "weight", "length"]], [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]], ) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) + kmidx1 = ps.from_pandas(pmidx1) + kmidx2 = ps.from_pandas(pmidx2) self.assert_eq( - psmidx1.symmetric_difference(psmidx2).sort_values(), + kmidx1.symmetric_difference(kmidx2).sort_values(), pmidx1.symmetric_difference(pmidx2).sort_values(), ) @@ -442,7 +442,7 @@ def test_multi_index_symmetric_difference(self): midx.symmetric_difference(idx) def test_missing(self): - psdf = ps.DataFrame( + kdf = ps.DataFrame( { "a": [1, 2, 3], "b": [4, 5, 6], @@ -461,7 +461,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), ): - getattr(psdf.set_index("a").index, name)() + getattr(kdf.set_index("a").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -470,7 +470,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index("a").index, name)() + getattr(kdf.set_index("a").index, name)() # MultiIndex functions missing_functions = inspect.getmembers(MissingPandasLikeMultiIndex, inspect.isfunction) @@ -482,7 +482,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index(["a", "b"]).index, name)() + getattr(kdf.set_index(["a", "b"]).index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -491,7 +491,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index(["a", "b"]).index, name)() + getattr(kdf.set_index(["a", "b"]).index, name)() # DatetimeIndex functions missing_functions = inspect.getmembers(MissingPandasLikeDatetimeIndex, inspect.isfunction) @@ -503,7 +503,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index("c").index, name)() + getattr(kdf.set_index("c").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -512,7 +512,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index("c").index, name)() + getattr(kdf.set_index("c").index, name)() # CategoricalIndex functions missing_functions = inspect.getmembers( @@ -526,7 +526,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index("d").index, name)() + getattr(kdf.set_index("d").index, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -535,7 +535,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index("d").index, name)() + getattr(kdf.set_index("d").index, name)() # Index properties missing_properties = inspect.getmembers( @@ -551,7 +551,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index("a").index, name) + getattr(kdf.set_index("a").index, name) deprecated_properties = [ name @@ -562,7 +562,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index("a").index, name) + getattr(kdf.set_index("a").index, name) # MultiIndex properties missing_properties = inspect.getmembers( @@ -578,7 +578,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), ): - getattr(psdf.set_index(["a", "b"]).index, name) + getattr(kdf.set_index(["a", "b"]).index, name) deprecated_properties = [ name @@ -589,7 +589,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) ): - getattr(psdf.set_index(["a", "b"]).index, name) + getattr(kdf.set_index(["a", "b"]).index, name) # DatetimeIndex properties missing_properties = inspect.getmembers( @@ -605,7 +605,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index("c").index, name) + getattr(kdf.set_index("c").index, name) # CategoricalIndex properties missing_properties = inspect.getmembers( @@ -621,7 +621,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psdf.set_index("d").index, name) + getattr(kdf.set_index("d").index, name) def test_index_has_duplicates(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] @@ -630,9 +630,9 @@ def test_index_has_duplicates(self): for idx, name, expected in zip(indexes, names, has_dup): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name)) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.has_duplicates, expected) + self.assertEqual(kdf.index.has_duplicates, expected) def test_multiindex_has_duplicates(self): indexes = [ @@ -645,172 +645,170 @@ def test_multiindex_has_duplicates(self): for idx, expected in zip(indexes, has_dup): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.has_duplicates, expected) + self.assertEqual(kdf.index.has_duplicates, expected) def test_multi_index_not_supported(self): - psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + kdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"): - psdf.set_index(["a", "b"]).index.any() + kdf.set_index(["a", "b"]).index.any() with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"): - psdf.set_index(["a", "b"]).index.all() + kdf.set_index(["a", "b"]).index.all() def test_index_nlevels(self): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.nlevels, 1) + self.assertEqual(kdf.index.nlevels, 1) def test_multiindex_nlevel(self): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")]) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.nlevels, 2) + self.assertEqual(kdf.index.nlevels, 2) def test_multiindex_from_arrays(self): arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]] pidx = pd.MultiIndex.from_arrays(arrays) - psidx = ps.MultiIndex.from_arrays(arrays) + kidx = ps.MultiIndex.from_arrays(arrays) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) def test_multiindex_swaplevel(self): pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), 
psidx.swaplevel(0, 1)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(-2, -1), psidx.swaplevel(-2, -1)) - self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) - self.assert_eq(pidx.swaplevel("word", 1), psidx.swaplevel("word", 1)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(-2, -1), kidx.swaplevel(-2, -1)) + self.assert_eq(pidx.swaplevel(0, 1), kidx.swaplevel(0, 1)) + self.assert_eq(pidx.swaplevel("word", 1), kidx.swaplevel("word", 1)) with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(-3, "word") + kidx.swaplevel(-3, "word") with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(0, 2) + kidx.swaplevel(0, 2) with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(0, -3) + kidx.swaplevel(0, -3) with self.assertRaisesRegex(KeyError, "Level work not found"): - psidx.swaplevel(0, "work") + kidx.swaplevel(0, "work") def test_multiindex_droplevel(self): pidx = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"] ) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 5"): - psidx.droplevel(4) + kidx.droplevel(4) with self.assertRaisesRegex(KeyError, "Level level4 not found"): - psidx.droplevel("level4") + kidx.droplevel("level4") with self.assertRaisesRegex(KeyError, "Level.*level3.*level4.*not found"): - psidx.droplevel([("level3", "level4")]) + kidx.droplevel([("level3", "level4")]) with self.assertRaisesRegex( ValueError, "Cannot remove 4 levels from an index with 3 levels: at least one " "level must be left.", ): - psidx.droplevel([0, 0, 1, 2]) + kidx.droplevel([0, 0, 1, 2]) with self.assertRaisesRegex( ValueError, "Cannot remove 3 levels from an index with 3 levels: at least one " "level must be left.", ): - psidx.droplevel([0, 1, 2]) + kidx.droplevel([0, 1, 2]) - self.assert_eq(pidx.droplevel(0), psidx.droplevel(0)) - self.assert_eq(pidx.droplevel([0, 1]), psidx.droplevel([0, 1])) - self.assert_eq(pidx.droplevel((0, 1)), psidx.droplevel((0, 1))) - self.assert_eq(pidx.droplevel([0, "level2"]), psidx.droplevel([0, "level2"])) - self.assert_eq(pidx.droplevel((0, "level2")), psidx.droplevel((0, "level2"))) + self.assert_eq(pidx.droplevel(0), kidx.droplevel(0)) + self.assert_eq(pidx.droplevel([0, 1]), kidx.droplevel([0, 1])) + self.assert_eq(pidx.droplevel((0, 1)), kidx.droplevel((0, 1))) + self.assert_eq(pidx.droplevel([0, "level2"]), kidx.droplevel([0, "level2"])) + self.assert_eq(pidx.droplevel((0, "level2")), kidx.droplevel((0, "level2"))) # non-string names pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.droplevel(1.0), psidx.droplevel(1.0)) - self.assert_eq(pidx.droplevel([0, 2.0]), psidx.droplevel([0, 2.0])) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.droplevel(1.0), kidx.droplevel(1.0)) + self.assert_eq(pidx.droplevel([0, 2.0]), kidx.droplevel([0, 2.0])) def test_index_fillna(self): pidx = pd.Index([1, 2, None]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.fillna(0), psidx.fillna(0), almost=True) - self.assert_eq(pidx.rename("name").fillna(0), psidx.rename("name").fillna(0), almost=True) + self.assert_eq(pidx.fillna(0), 
kidx.fillna(0), almost=True) + self.assert_eq(pidx.rename("name").fillna(0), kidx.rename("name").fillna(0), almost=True) with self.assertRaisesRegex(TypeError, "Unsupported type list"): - psidx.fillna([1, 2]) + kidx.fillna([1, 2]) def test_index_drop(self): pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop(1), psidx.drop(1)) - self.assert_eq(pidx.drop([1, 2]), psidx.drop([1, 2])) - self.assert_eq((pidx + 1).drop([2, 3]), (psidx + 1).drop([2, 3])) + self.assert_eq(pidx.drop(1), kidx.drop(1)) + self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2])) + self.assert_eq((pidx + 1).drop([2, 3]), (kidx + 1).drop([2, 3])) def test_multiindex_drop(self): pidx = pd.MultiIndex.from_tuples( [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"] ) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop("a"), psidx.drop("a")) - self.assert_eq(pidx.drop(["a", "b"]), psidx.drop(["a", "b"])) - self.assert_eq(pidx.drop(["x", "y"], level=1), psidx.drop(["x", "y"], level=1)) - self.assert_eq( - pidx.drop(["x", "y"], level="level2"), psidx.drop(["x", "y"], level="level2") - ) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop("a"), kidx.drop("a")) + self.assert_eq(pidx.drop(["a", "b"]), kidx.drop(["a", "b"])) + self.assert_eq(pidx.drop(["x", "y"], level=1), kidx.drop(["x", "y"], level=1)) + self.assert_eq(pidx.drop(["x", "y"], level="level2"), kidx.drop(["x", "y"], level="level2")) pidx.names = ["lv1", "lv2"] - psidx.names = ["lv1", "lv2"] - self.assert_eq(pidx.drop(["x", "y"], level="lv2"), psidx.drop(["x", "y"], level="lv2")) + kidx.names = ["lv1", "lv2"] + self.assert_eq(pidx.drop(["x", "y"], level="lv2"), kidx.drop(["x", "y"], level="lv2")) - self.assertRaises(IndexError, lambda: psidx.drop(["a", "b"], level=2)) - self.assertRaises(KeyError, lambda: psidx.drop(["a", "b"], level="level")) + self.assertRaises(IndexError, lambda: kidx.drop(["a", "b"], level=2)) + self.assertRaises(KeyError, lambda: kidx.drop(["a", "b"], level="level")) - psidx.names = ["lv", "lv"] - self.assertRaises(ValueError, lambda: psidx.drop(["x", "y"], level="lv")) + kidx.names = ["lv", "lv"] + self.assertRaises(ValueError, lambda: kidx.drop(["x", "y"], level="lv")) def test_sort_values(self): pidx = pd.Index([-10, -100, 200, 100]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.sort_values(), psidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) pidx.name = "koalas" - psidx.name = "koalas" + kidx.name = "koalas" - self.assert_eq(pidx.sort_values(), psidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), kidx.sort_values(ascending=False)) pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) pidx.names = ["hello", "koalas", "goodbye"] - psidx.names = ["hello", "koalas", "goodbye"] + kidx.names = ["hello", "koalas", "goodbye"] - self.assert_eq(pidx.sort_values(), psidx.sort_values()) - self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False)) + self.assert_eq(pidx.sort_values(), kidx.sort_values()) + self.assert_eq(pidx.sort_values(ascending=False), 
kidx.sort_values(ascending=False)) def test_index_drop_duplicates(self): pidx = pd.Index([1, 1, 2]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.drop_duplicates().sort_values(), kidx.drop_duplicates().sort_values()) def test_index_sort(self): idx = ps.Index([1, 2, 3, 4, 5]) @@ -826,208 +824,204 @@ def test_index_sort(self): midx.sort() def test_multiindex_isna(self): - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - psidx.isna() + kidx.isna() with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - psidx.isnull() + kidx.isnull() with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - psidx.notna() + kidx.notna() with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - psidx.notnull() + kidx.notnull() def test_index_nunique(self): pidx = pd.Index([1, 1, 2, None]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.nunique(), psidx.nunique()) - self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True)) + self.assert_eq(pidx.nunique(), kidx.nunique()) + self.assert_eq(pidx.nunique(dropna=True), kidx.nunique(dropna=True)) def test_multiindex_nunique(self): - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - psidx.notnull() + kidx.notnull() def test_multiindex_rename(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) pidx = pidx.rename(list("ABC")) - psidx = psidx.rename(list("ABC")) - self.assert_eq(pidx, psidx) + kidx = kidx.rename(list("ABC")) + self.assert_eq(pidx, kidx) pidx = pidx.rename(["my", "name", "is"]) - psidx = psidx.rename(["my", "name", "is"]) - self.assert_eq(pidx, psidx) + kidx = kidx.rename(["my", "name", "is"]) + self.assert_eq(pidx, kidx) def test_multiindex_set_names(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) pidx = pidx.set_names(["set", "new", "names"]) - psidx = psidx.set_names(["set", "new", "names"]) - self.assert_eq(pidx, psidx) + kidx = kidx.set_names(["set", "new", "names"]) + self.assert_eq(pidx, kidx) pidx.set_names(["set", "new", "names"], inplace=True) - psidx.set_names(["set", "new", "names"], inplace=True) - self.assert_eq(pidx, psidx) + kidx.set_names(["set", "new", "names"], inplace=True) + self.assert_eq(pidx, kidx) pidx = pidx.set_names("first", level=0) - psidx = psidx.set_names("first", level=0) - self.assert_eq(pidx, psidx) + kidx = kidx.set_names("first", level=0) + self.assert_eq(pidx, kidx) pidx = pidx.set_names("second", level=1) - psidx = 
psidx.set_names("second", level=1) - self.assert_eq(pidx, psidx) + kidx = kidx.set_names("second", level=1) + self.assert_eq(pidx, kidx) pidx = pidx.set_names("third", level=2) - psidx = psidx.set_names("third", level=2) - self.assert_eq(pidx, psidx) + kidx = kidx.set_names("third", level=2) + self.assert_eq(pidx, kidx) pidx.set_names("first", level=0, inplace=True) - psidx.set_names("first", level=0, inplace=True) - self.assert_eq(pidx, psidx) + kidx.set_names("first", level=0, inplace=True) + self.assert_eq(pidx, kidx) pidx.set_names("second", level=1, inplace=True) - psidx.set_names("second", level=1, inplace=True) - self.assert_eq(pidx, psidx) + kidx.set_names("second", level=1, inplace=True) + self.assert_eq(pidx, kidx) pidx.set_names("third", level=2, inplace=True) - psidx.set_names("third", level=2, inplace=True) - self.assert_eq(pidx, psidx) + kidx.set_names("third", level=2, inplace=True) + self.assert_eq(pidx, kidx) def test_multiindex_from_tuples(self): tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] pidx = pd.MultiIndex.from_tuples(tuples) - psidx = ps.MultiIndex.from_tuples(tuples) + kidx = ps.MultiIndex.from_tuples(tuples) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) def test_multiindex_from_product(self): iterables = [[0, 1, 2], ["green", "purple"]] pidx = pd.MultiIndex.from_product(iterables) - psidx = ps.MultiIndex.from_product(iterables) + kidx = ps.MultiIndex.from_product(iterables) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) def test_multiindex_tuple_column_name(self): column_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=column_labels) pdf.set_index(("a", "x"), append=True, inplace=True) - psdf = ps.from_pandas(pdf) - self.assert_eq(pdf, psdf) + kdf = ps.from_pandas(pdf) + self.assert_eq(pdf, kdf) def test_len(self): pidx = pd.Index(range(10000)) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(len(pidx), len(psidx)) + self.assert_eq(len(pidx), len(kidx)) pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - self.assert_eq(len(pidx), len(psidx)) + self.assert_eq(len(pidx), len(kidx)) def test_delete(self): pidx = pd.Index([10, 9, 8, 7, 6, 7, 8, 9, 10]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.delete(8).sort_values(), psidx.delete(8).sort_values()) - self.assert_eq(pidx.delete(-9).sort_values(), psidx.delete(-9).sort_values()) - self.assert_eq( - pidx.delete([-9, 0, 8]).sort_values(), psidx.delete([-9, 0, 8]).sort_values() - ) + self.assert_eq(pidx.delete(8).sort_values(), kidx.delete(8).sort_values()) + self.assert_eq(pidx.delete(-9).sort_values(), kidx.delete(-9).sort_values()) + self.assert_eq(pidx.delete([-9, 0, 8]).sort_values(), kidx.delete([-9, 0, 8]).sort_values()) with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"): - psidx.delete([0, 9]) + kidx.delete([0, 9]) with self.assertRaisesRegex( IndexError, "index -10 is out of bounds for axis 0 with size 9" ): - psidx.delete([-10, 0]) + kidx.delete([-10, 0]) with self.assertRaisesRegex(IndexError, "index 9 is out of bounds for axis 0 with size 9"): - psidx.delete(9) + kidx.delete(9) with self.assertRaisesRegex( IndexError, "index -10 is out of bounds for axis 0 with size 9" ): - 
psidx.delete(-10) + kidx.delete(-10) # MultiIndex pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - self.assert_eq(pidx.delete(2).sort_values(), psidx.delete(2).sort_values()) - self.assert_eq(pidx.delete(-3).sort_values(), psidx.delete(-3).sort_values()) - self.assert_eq( - pidx.delete([-3, 0, 2]).sort_values(), psidx.delete([-3, 0, 2]).sort_values() - ) + self.assert_eq(pidx.delete(2).sort_values(), kidx.delete(2).sort_values()) + self.assert_eq(pidx.delete(-3).sort_values(), kidx.delete(-3).sort_values()) + self.assert_eq(pidx.delete([-3, 0, 2]).sort_values(), kidx.delete([-3, 0, 2]).sort_values()) with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"): - psidx.delete([0, 3]) + kidx.delete([0, 3]) with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"): - psidx.delete([-4, 0]) + kidx.delete([-4, 0]) with self.assertRaisesRegex(IndexError, "index 3 is out of bounds for axis 0 with size 3"): - psidx.delete(3) + kidx.delete(3) with self.assertRaisesRegex(IndexError, "index -4 is out of bounds for axis 0 with size 3"): - psidx.delete(-4) + kidx.delete(-4) def test_append(self): # Index pidx = pd.Index(range(10000)) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.append(pidx), psidx.append(psidx)) + self.assert_eq(pidx.append(pidx), kidx.append(kidx)) # Index with name pidx1 = pd.Index(range(10000), name="a") pidx2 = pd.Index(range(10000), name="b") - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) - self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) + self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # Index from DataFrame pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}, index=["x", "y", "z"]) - psdf1 = ps.from_pandas(pdf1) - psdf2 = ps.from_pandas(pdf2) + kdf1 = ps.from_pandas(pdf1) + kdf2 = ps.from_pandas(pdf2) pidx1 = pdf1.set_index("a").index pidx2 = pdf2.set_index("d").index - psidx1 = psdf1.set_index("a").index - psidx2 = psdf2.set_index("d").index + kidx1 = kdf1.set_index("a").index + kidx2 = kdf2.set_index("d").index - self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) + self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # Index from DataFrame with MultiIndex columns pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}) pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")]) - psdf1 = ps.from_pandas(pdf1) - psdf2 = ps.from_pandas(pdf2) + kdf1 = ps.from_pandas(pdf1) + kdf2 = ps.from_pandas(pdf2) pidx1 = pdf1.set_index(("a", "x")).index pidx2 = pdf2.set_index(("d", "y")).index - psidx1 = psdf1.set_index(("a", "x")).index - psidx2 = psdf2.set_index(("d", "y")).index + kidx1 = kdf1.set_index(("a", "x")).index + kidx2 = kdf2.set_index(("d", "y")).index - self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2)) + 
self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2)) - self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1)) + self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1)) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.append(pmidx), psmidx.append(psmidx)) + self.assert_eq(pmidx.append(pmidx), kmidx.append(kmidx)) # MultiIndex with names pmidx1 = pd.MultiIndex.from_tuples( @@ -1036,83 +1030,83 @@ def test_append(self): pmidx2 = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"] ) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) + kmidx1 = ps.from_pandas(pmidx1) + kmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.append(pmidx2), psmidx1.append(psmidx2)) + self.assert_eq(pmidx1.append(pmidx2), kmidx1.append(kmidx2)) - self.assert_eq(pmidx2.append(pmidx1), psmidx2.append(psmidx1)) + self.assert_eq(pmidx2.append(pmidx1), kmidx2.append(kmidx1)) - self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) - self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names) + self.assert_eq(pmidx1.append(pmidx2).names, kmidx1.append(kmidx2).names) # Index & MultiIndex currently is not supported expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported" with self.assertRaisesRegex(NotImplementedError, expected_error_message): - psidx.append(psmidx) + kidx.append(kmidx) with self.assertRaisesRegex(NotImplementedError, expected_error_message): - psmidx.append(psidx) + kmidx.append(kidx) def test_argmin(self): pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.argmin(), psidx.argmin()) + self.assert_eq(pidx.argmin(), kidx.argmin()) # MultiIndex - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( TypeError, "reduction operation 'argmin' not allowed for this dtype" ): - psidx.argmin() + kidx.argmin() def test_argmax(self): pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.argmax(), psidx.argmax()) + self.assert_eq(pidx.argmax(), kidx.argmax()) # MultiIndex - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + kidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) with self.assertRaisesRegex( TypeError, "reduction operation 'argmax' not allowed for this dtype" ): - psidx.argmax() + kidx.argmax() def test_min(self): pidx = pd.Index([3, 2, 1]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.min(), psidx.min()) + self.assert_eq(pidx.min(), kidx.min()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.min(), psmidx.min()) + self.assert_eq(pmidx.min(), kmidx.min()) pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.min(), psidx.min()) + self.assert_eq(pidx.min(), kidx.min()) def test_max(self): 
pidx = pd.Index([3, 2, 1]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.max(), psidx.max()) + self.assert_eq(pidx.max(), kidx.max()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.max(), psmidx.max()) + self.assert_eq(pmidx.max(), kmidx.max()) pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.max(), psidx.max()) + self.assert_eq(pidx.max(), kidx.max()) def test_monotonic(self): # test monotonic_increasing & monotonic_decreasing for MultiIndex. @@ -1180,9 +1174,9 @@ def test_monotonic(self): for data in datas: with self.subTest(data=data): pmidx = pd.MultiIndex.from_tuples(data) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) # datas below return different result depends on pandas version. # Because the behavior of handling null values is changed in pandas >= 1.1.4. @@ -1215,13 +1209,13 @@ def test_monotonic(self): for data in datas: with self.subTest(data=data): pmidx = pd.MultiIndex.from_tuples(data) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) if LooseVersion(pd.__version__) < LooseVersion("1.1.4"): - self.assert_eq(psmidx.is_monotonic_increasing, False) - self.assert_eq(psmidx.is_monotonic_decreasing, False) + self.assert_eq(kmidx.is_monotonic_increasing, False) + self.assert_eq(kmidx.is_monotonic_decreasing, False) else: - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) # The datas below are tested another way since they cannot be an arguments for # `MultiIndex.from_tuples` in pandas >= 1.1.0. 
@@ -1230,105 +1224,104 @@ def test_monotonic(self): pmidx = pd.MultiIndex.from_tuples( [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] ) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.is_monotonic_increasing, False) - self.assert_eq(psmidx.is_monotonic_decreasing, False) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(kmidx.is_monotonic_increasing, False) + self.assert_eq(kmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples( [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] ) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.is_monotonic_increasing, False) - self.assert_eq(psmidx.is_monotonic_decreasing, False) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(kmidx.is_monotonic_increasing, False) + self.assert_eq(kmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples( [(None, None), (None, None), (None, None), (None, None), (None, None)] ) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.is_monotonic_increasing, False) - self.assert_eq(psmidx.is_monotonic_decreasing, False) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(kmidx.is_monotonic_increasing, False) + self.assert_eq(kmidx.is_monotonic_decreasing, False) pmidx = pd.MultiIndex.from_tuples([(None, None)]) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.is_monotonic_increasing, False) - self.assert_eq(psmidx.is_monotonic_decreasing, False) - - else: - [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] - psdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]}) - psdf["b"] = None - psmidx = psdf.set_index(["a", "b"]).index - pmidx = psmidx.to_pandas() - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - - [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] - psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]}) - psdf["a"] = None - psmidx = psdf.set_index(["a", "b"]).index - pmidx = psmidx.to_pandas() - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - - [(None, None), (None, None), (None, None), (None, None), (None, None)] - psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]}) - psdf["a"] = None - psdf["b"] = None - psmidx = psdf.set_index(["a", "b"]).index - pmidx = psmidx.to_pandas() - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) - [(None, None)] - psdf = ps.DataFrame({"a": [1], "b": [1]}) - psdf["a"] = None - psdf["b"] = None - psmidx = psdf.set_index(["a", "b"]).index - pmidx = psmidx.to_pandas() - self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) - self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(kmidx.is_monotonic_increasing, False) + self.assert_eq(kmidx.is_monotonic_decreasing, False) + + # Disable the test cases below because pandas returns `True` or `False` randomly. 
+ # else: + # [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)] + # kdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]}) + # kdf["b"] = None + # kmidx = kdf.set_index(["a", "b"]).index + # pmidx = kmidx.to_pandas() + # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + + # [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")] + # kdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]}) + # kdf["a"] = None + # kmidx = kdf.set_index(["a", "b"]).index + # pmidx = kmidx.to_pandas() + # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + + # [(None, None), (None, None), (None, None), (None, None), (None, None)] + # kdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]}) + # kdf["a"] = None + # kdf["b"] = None + # kmidx = kdf.set_index(["a", "b"]).index + # pmidx = kmidx.to_pandas() + # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) + # [(None, None)] + # kdf = ps.DataFrame({"a": [1], "b": [1]}) + # kdf["a"] = None + # kdf["b"] = None + # kmidx = kdf.set_index(["a", "b"]).index + # pmidx = kmidx.to_pandas() + # self.assert_eq(kmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing) + # self.assert_eq(kmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing) def test_difference(self): # Index pidx1 = pd.Index([1, 2, 3, 4], name="koalas") pidx2 = pd.Index([3, 4, 5, 6], name="koalas") - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) + self.assert_eq(kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values()) self.assert_eq( - psidx1.difference(psidx2).sort_values(), pidx1.difference(pidx2).sort_values() - ) - self.assert_eq( - psidx1.difference([3, 4, 5, 6]).sort_values(), + kidx1.difference([3, 4, 5, 6]).sort_values(), pidx1.difference([3, 4, 5, 6]).sort_values(), ) self.assert_eq( - psidx1.difference((3, 4, 5, 6)).sort_values(), + kidx1.difference((3, 4, 5, 6)).sort_values(), pidx1.difference((3, 4, 5, 6)).sort_values(), ) self.assert_eq( - psidx1.difference({3, 4, 5, 6}).sort_values(), + kidx1.difference({3, 4, 5, 6}).sort_values(), pidx1.difference({3, 4, 5, 6}).sort_values(), ) self.assert_eq( - psidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), + kidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), pidx1.difference({3: 1, 4: 2, 5: 3, 6: 4}).sort_values(), ) # Exceptions for Index with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx1.difference("1234") + kidx1.difference("1234") with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx1.difference(1234) + kidx1.difference(1234) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx1.difference(12.34) + kidx1.difference(12.34) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx1.difference(None) + kidx1.difference(None) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx1.difference(np.nan) + kidx1.difference(np.nan) with self.assertRaisesRegex( ValueError, "The 'sort' keyword only takes the values of None or True; 1 was passed." 
): - psidx1.difference(psidx2, sort=1) + kidx1.difference(kidx2, sort=1) # MultiIndex pidx1 = pd.MultiIndex.from_tuples( @@ -1337,150 +1330,148 @@ def test_difference(self): pidx2 = pd.MultiIndex.from_tuples( [("a", "x", 1), ("b", "z", 2), ("k", "z", 3)], names=["hello", "koalas", "world"] ) - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) + self.assert_eq(kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values()) self.assert_eq( - psidx1.difference(psidx2).sort_values(), pidx1.difference(pidx2).sort_values() - ) - self.assert_eq( - psidx1.difference({("a", "x", 1)}).sort_values(), + kidx1.difference({("a", "x", 1)}).sort_values(), pidx1.difference({("a", "x", 1)}).sort_values(), ) self.assert_eq( - psidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), + kidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), pidx1.difference({("a", "x", 1): [1, 2, 3]}).sort_values(), ) # Exceptions for MultiIndex with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - psidx1.difference(["b", "z", "2"]) + kidx1.difference(["b", "z", "2"]) def test_repeat(self): pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) - self.assert_eq(psidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) - self.assert_eq((psidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) + self.assert_eq(kidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) + self.assert_eq(kidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) + self.assert_eq((kidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) - self.assertRaises(ValueError, lambda: psidx.repeat(-1)) - self.assertRaises(TypeError, lambda: psidx.repeat("abc")) + self.assertRaises(ValueError, lambda: kidx.repeat(-1)) + self.assertRaises(ValueError, lambda: kidx.repeat("abc")) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) - self.assert_eq(psmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) + self.assert_eq(kmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) + self.assert_eq(kmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) - self.assertRaises(ValueError, lambda: psmidx.repeat(-1)) - self.assertRaises(TypeError, lambda: psmidx.repeat("abc")) + self.assertRaises(ValueError, lambda: kmidx.repeat(-1)) + self.assertRaises(ValueError, lambda: kmidx.repeat("abc")) def test_unique(self): pidx = pd.Index(["a", "b", "a"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values()) - self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values()) + self.assert_eq(kidx.unique().sort_values(), pidx.unique().sort_values()) + self.assert_eq(kidx.unique().sort_values(), pidx.unique().sort_values()) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) - self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) + self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values()) + 
self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values()) def test_asof(self): # Increasing values pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) - self.assert_eq(psidx.asof("2014-01-04"), pidx.asof("2014-01-04")) + self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + self.assert_eq(repr(kidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) + self.assert_eq(kidx.asof("2014-01-04"), pidx.asof("2014-01-04")) pidx = pd.DatetimeIndex(["2013-12-31", "2014-01-02", "2014-01-03"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) + self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + self.assert_eq(repr(kidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02"))) # Decreasing values pidx = pd.Index(["2014-01-03", "2014-01-02", "2013-12-31"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02")) - self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) + self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + self.assert_eq(kidx.asof("1999-01-02"), pidx.asof("1999-01-02")) + self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) pidx = pd.DatetimeIndex(["2014-01-03", "2014-01-02", "2013-12-31"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) # TODO: a pandas bug? 
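Illustration (not part of the patch): the TODO above flags that pandas has returned inconsistent `asof` results for a monotonically decreasing DatetimeIndex, which is why the assertions below are commented out and replaced with explicitly pinned timestamps. A minimal sketch of the pinned semantics, assuming an active Spark session:

    import pandas as pd
    import pyspark.pandas as ps

    kidx = ps.from_pandas(pd.DatetimeIndex(["2014-01-03", "2014-01-02", "2013-12-31"]))

    # On a decreasing index, asof resolves to the smallest label >= the key,
    # and to NaT when no label qualifies (mirroring the pinned expectations).
    assert kidx.asof("2014-01-01") == pd.Timestamp("2014-01-02")
    assert repr(kidx.asof("2015-01-02")) == repr(pd.NaT)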
- # self.assert_eq(psidx.asof("2014-01-01"), pidx.asof("2014-01-01")) - # self.assert_eq(psidx.asof("2014-01-02"), pidx.asof("2014-01-02")) - # self.assert_eq(psidx.asof("1999-01-02"), pidx.asof("1999-01-02")) - # self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) - self.assert_eq(psidx.asof("2014-01-01"), pd.Timestamp("2014-01-02 00:00:00")) - self.assert_eq(psidx.asof("2014-01-02"), pd.Timestamp("2014-01-02 00:00:00")) - self.assert_eq(psidx.asof("1999-01-02"), pd.Timestamp("2013-12-31 00:00:00")) - self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pd.NaT)) + # self.assert_eq(kidx.asof("2014-01-01"), pidx.asof("2014-01-01")) + # self.assert_eq(kidx.asof("2014-01-02"), pidx.asof("2014-01-02")) + # self.assert_eq(kidx.asof("1999-01-02"), pidx.asof("1999-01-02")) + # self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02"))) + self.assert_eq(kidx.asof("2014-01-01"), pd.Timestamp("2014-01-02 00:00:00")) + self.assert_eq(kidx.asof("2014-01-02"), pd.Timestamp("2014-01-02 00:00:00")) + self.assert_eq(kidx.asof("1999-01-02"), pd.Timestamp("2013-12-31 00:00:00")) + self.assert_eq(repr(kidx.asof("2015-01-02")), repr(pd.NaT)) # Not increasing, neither decreasing (ValueError) - psidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"]) - self.assertRaises(ValueError, lambda: psidx.asof("2013-12-31")) + kidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"]) + self.assertRaises(ValueError, lambda: kidx.asof("2013-12-31")) - psmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")]) - self.assertRaises(NotImplementedError, lambda: psmidx.asof(("a", "b"))) + kmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")]) + self.assertRaises(NotImplementedError, lambda: kmidx.asof(("a", "b"))) def test_union(self): # Index pidx1 = pd.Index([1, 2, 3, 4]) pidx2 = pd.Index([3, 4, 5, 6]) - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) - self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) - self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) - self.assert_eq(psidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True) - self.assert_eq(psidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True) + self.assert_eq(kidx1.union(kidx2), pidx1.union(pidx2)) + self.assert_eq(kidx2.union(kidx1), pidx2.union(pidx1)) + self.assert_eq(kidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True) + self.assert_eq(kidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True) self.assert_eq( - psidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True + kidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True ) self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True + kidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True ) # Testing if the result is correct after sort=False. # The `sort` argument is added in pandas 0.24. 
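Illustration (not part of the patch): the version check that follows is the suite's recurring gating idiom for features added in newer pandas releases; a minimal sketch of the pattern:

    from distutils.version import LooseVersion
    import pandas as pd

    # Only exercise the `sort` keyword where pandas supports it (0.24+).
    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        result = pd.Index([1, 2, 3]).union(pd.Index([3, 4]), sort=False)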
if LooseVersion(pd.__version__) >= LooseVersion("0.24"): self.assert_eq( - psidx1.union(psidx2, sort=False).sort_values(), + kidx1.union(kidx2, sort=False).sort_values(), pidx1.union(pidx2, sort=False).sort_values(), ) self.assert_eq( - psidx2.union(psidx1, sort=False).sort_values(), + kidx2.union(kidx1, sort=False).sort_values(), pidx2.union(pidx1, sort=False).sort_values(), ) self.assert_eq( - psidx1.union([3, 4, 5, 6], sort=False).sort_values(), + kidx1.union([3, 4, 5, 6], sort=False).sort_values(), pidx1.union([3, 4, 5, 6], sort=False).sort_values(), almost=True, ) self.assert_eq( - psidx2.union([1, 2, 3, 4], sort=False).sort_values(), + kidx2.union([1, 2, 3, 4], sort=False).sort_values(), pidx2.union([1, 2, 3, 4], sort=False).sort_values(), almost=True, ) self.assert_eq( - psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), + kidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(), almost=True, ) self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), + kidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(), almost=True, ) @@ -1489,26 +1480,26 @@ def test_union(self): if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4]) pidx2 = pd.Index([3, 4, 3, 4, 5, 6]) - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + kidx1 = ps.from_pandas(pidx1) + kidx2 = ps.from_pandas(pidx2) - self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) - self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) + self.assert_eq(kidx1.union(kidx2), pidx1.union(pidx2)) + self.assert_eq(kidx2.union(kidx1), pidx2.union(pidx1)) self.assert_eq( - psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True + kidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True ) self.assert_eq( - psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), + kidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), almost=True, ) self.assert_eq( - psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), + kidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])), almost=True, ) self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), + kidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])), almost=True, ) @@ -1518,29 +1509,29 @@ def test_union(self): pmidx2 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]) pmidx3 = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]) pmidx4 = pd.MultiIndex.from_tuples([(1, 3), (1, 4), (1, 5), (1, 6)]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - psmidx3 = ps.from_pandas(pmidx3) - psmidx4 = ps.from_pandas(pmidx4) - - self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2)) - self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1)) - self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4)) - self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3)) + kmidx1 = ps.from_pandas(pmidx1) + kmidx2 = ps.from_pandas(pmidx2) + kmidx3 = ps.from_pandas(pmidx3) + kmidx4 = ps.from_pandas(pmidx4) + + self.assert_eq(kmidx1.union(kmidx2), pmidx1.union(pmidx2)) + self.assert_eq(kmidx2.union(kmidx1), pmidx2.union(pmidx1)) + self.assert_eq(kmidx3.union(kmidx4), pmidx3.union(pmidx4)) + self.assert_eq(kmidx4.union(kmidx3), pmidx4.union(pmidx3)) self.assert_eq( - 
psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), + kmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), ) self.assert_eq( - psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), + kmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), ) self.assert_eq( - psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), + kmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), ) self.assert_eq( - psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), + kmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), ) @@ -1548,23 +1539,23 @@ def test_union(self): # The `sort` argument is added in pandas 0.24. if LooseVersion(pd.__version__) >= LooseVersion("0.24"): self.assert_eq( - psmidx1.union(psmidx2, sort=False).sort_values(), + kmidx1.union(kmidx2, sort=False).sort_values(), pmidx1.union(pmidx2, sort=False).sort_values(), ) self.assert_eq( - psmidx2.union(psmidx1, sort=False).sort_values(), + kmidx2.union(kmidx1, sort=False).sort_values(), pmidx2.union(pmidx1, sort=False).sort_values(), ) self.assert_eq( - psmidx3.union(psmidx4, sort=False).sort_values(), + kmidx3.union(kmidx4, sort=False).sort_values(), pmidx3.union(pmidx4, sort=False).sort_values(), ) self.assert_eq( - psmidx4.union(psmidx3, sort=False).sort_values(), + kmidx4.union(kmidx3, sort=False).sort_values(), pmidx4.union(pmidx3, sort=False).sort_values(), ) self.assert_eq( - psmidx1.union( + kmidx1.union( [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False ).sort_values(), pmidx1.union( @@ -1572,7 +1563,7 @@ def test_union(self): ).sort_values(), ) self.assert_eq( - psmidx2.union( + kmidx2.union( [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False ).sort_values(), pmidx2.union( @@ -1580,11 +1571,11 @@ def test_union(self): ).sort_values(), ) self.assert_eq( - psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), + kmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), ) self.assert_eq( - psmidx4.union( + kmidx4.union( [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False ).sort_values(), pmidx4.union( @@ -1592,331 +1583,330 @@ def test_union(self): ).sort_values(), ) - self.assertRaises(NotImplementedError, lambda: psidx1.union(psmidx1)) - self.assertRaises(TypeError, lambda: psmidx1.union(psidx1)) - self.assertRaises(TypeError, lambda: psmidx1.union(["x", "a"])) - self.assertRaises(ValueError, lambda: psidx1.union(ps.range(2))) + self.assertRaises(NotImplementedError, lambda: kidx1.union(kmidx1)) + self.assertRaises(TypeError, lambda: kmidx1.union(kidx1)) + self.assertRaises(TypeError, lambda: kmidx1.union(["x", "a"])) + self.assertRaises(ValueError, lambda: kidx1.union(ps.range(2))) def test_take(self): # Index pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values()) + self.assert_eq(kidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values()) self.assert_eq( - psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values() + kidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values() ) - self.assert_eq(psidx.take([-4, 
-2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values()) + self.assert_eq(kidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values()) self.assert_eq( - psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values() + kidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values() ) # MultiIndex pmidx = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"] ) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values()) + self.assert_eq(kmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values()) self.assert_eq( - psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values() + kmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values() ) - self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values()) + self.assert_eq(kmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values()) self.assert_eq( - psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values() + kmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values() ) # Checking the type of indices. - self.assertRaises(TypeError, lambda: psidx.take(1)) - self.assertRaises(TypeError, lambda: psidx.take("1")) - self.assertRaises(TypeError, lambda: psidx.take({1, 2})) - self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None})) - self.assertRaises(TypeError, lambda: psmidx.take(1)) - self.assertRaises(TypeError, lambda: psmidx.take("1")) - self.assertRaises(TypeError, lambda: psmidx.take({1, 2})) - self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None})) + self.assertRaises(ValueError, lambda: kidx.take(1)) + self.assertRaises(ValueError, lambda: kidx.take("1")) + self.assertRaises(ValueError, lambda: kidx.take({1, 2})) + self.assertRaises(ValueError, lambda: kidx.take({1: None, 2: None})) + self.assertRaises(ValueError, lambda: kmidx.take(1)) + self.assertRaises(ValueError, lambda: kmidx.take("1")) + self.assertRaises(ValueError, lambda: kmidx.take({1, 2})) + self.assertRaises(ValueError, lambda: kmidx.take({1: None, 2: None})) def test_index_get_level_values(self): pidx = pd.Index([1, 2, 3], name="ks") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) for level in [0, "ks"]: - self.assert_eq(psidx.get_level_values(level), pidx.get_level_values(level)) + self.assert_eq(kidx.get_level_values(level), pidx.get_level_values(level)) def test_multiindex_get_level_values(self): pmidx = pd.MultiIndex.from_tuples([("a", "d"), ("b", "e"), ("c", "f")]) pmidx.names = ["level_1", "level_2"] - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) for level in [0, 1, "level_1", "level_2"]: - self.assert_eq(psmidx.get_level_values(level), pmidx.get_level_values(level)) + self.assert_eq(kmidx.get_level_values(level), pmidx.get_level_values(level)) def test_index_get_level_number(self): # name of two levels are the same, which is None - psdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) + kdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) with self.assertRaisesRegex( ValueError, "The name None occurs multiple times, use a level number" ): - psdf.index._get_level_number(None) + kdf.index._get_level_number(None) mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) mi.names = ["level_1", "level_2"] - psdf = ps.DataFrame({"a": [1, 2, 3]}, index=mi) + kdf = 
ps.DataFrame({"a": [1, 2, 3]}, index=mi) # level is not int and not in the level name list with self.assertRaisesRegex(KeyError, "Level lv_3 not found"): - psdf.index._get_level_number("lv_3") + kdf.index._get_level_number("lv_3") # level is int, but an invalid negative number with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - psdf.index._get_level_number(-3) + kdf.index._get_level_number(-3) # level is int, but an invalid positive number with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - psdf.index._get_level_number(3) + kdf.index._get_level_number(3) # Correct and valid inputs in numbers level_number = [-2, -1, 0, 1] outputs = [0, 1, 0, 1] for lv, output in zip(level_number, outputs): - self.assertEqual(output, psdf.index._get_level_number(lv)) + self.assertEqual(output, kdf.index._get_level_number(lv)) # Valid inputs as level names level_names = ["level_1", "level_2"] outputs = [0, 1] for lv, output in zip(level_names, outputs): - self.assertEqual(output, psdf.index._get_level_number(lv)) + self.assertEqual(output, kdf.index._get_level_number(lv)) def test_holds_integer(self): pidx = pd.Index([1, 2, 3, 4]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) pidx = pd.Index([1.1, 2.2, 3.3, 4.4]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) pidx = pd.Index(["A", "B", "C", "D"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.holds_integer(), psidx.holds_integer()) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.holds_integer(), kidx.holds_integer()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")]) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer()) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.holds_integer(), kmidx.holds_integer()) pmidx = pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)]) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer()) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.holds_integer(), kmidx.holds_integer()) def test_abs(self): pidx = pd.Index([-2, -1, 0, 1]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(abs(pidx), abs(psidx)) - self.assert_eq(np.abs(pidx), np.abs(psidx)) + self.assert_eq(abs(pidx), abs(kidx)) + self.assert_eq(np.abs(pidx), np.abs(kidx)) - psidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) + kidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): - abs(psidx) + abs(kidx) def test_hasnans(self): # BooleanType pidx = pd.Index([True, False, True, True]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, psidx.hasnans) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) pidx = pd.Index([True, False, np.nan, True]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, psidx.hasnans) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) # TimestampType pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) pser = 
pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) # Not supported for MultiIndex - psmidx = ps.Index([("a", 1), ("b", 2)]) - self.assertRaises(NotImplementedError, lambda: psmidx.hasnans()) + kmidx = ps.Index([("a", 1), ("b", 2)]) + self.assertRaises(NotImplementedError, lambda: kmidx.hasnans()) def test_intersection(self): pidx = pd.Index([1, 2, 3, 4], name="Koalas") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) # other = Index pidx_other = pd.Index([3, 4, 5, 6], name="Koalas") - psidx_other = ps.from_pandas(pidx_other) - self.assert_eq(pidx.intersection(pidx_other), psidx.intersection(psidx_other).sort_values()) + kidx_other = ps.from_pandas(pidx_other) + self.assert_eq(pidx.intersection(pidx_other), kidx.intersection(kidx_other).sort_values()) self.assert_eq( - (pidx + 1).intersection(pidx_other), (psidx + 1).intersection(psidx_other).sort_values() + (pidx + 1).intersection(pidx_other), (kidx + 1).intersection(kidx_other).sort_values() ) pidx_other_different_name = pd.Index([3, 4, 5, 6], name="Databricks") - psidx_other_different_name = ps.from_pandas(pidx_other_different_name) + kidx_other_different_name = ps.from_pandas(pidx_other_different_name) self.assert_eq( pidx.intersection(pidx_other_different_name), - psidx.intersection(psidx_other_different_name).sort_values(), + kidx.intersection(kidx_other_different_name).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pidx_other_different_name), - (psidx + 1).intersection(psidx_other_different_name).sort_values(), + (kidx + 1).intersection(kidx_other_different_name).sort_values(), ) pidx_other_from_frame = pd.DataFrame({"a": [3, 4, 5, 6]}).set_index("a").index - psidx_other_from_frame = ps.from_pandas(pidx_other_from_frame) + kidx_other_from_frame = ps.from_pandas(pidx_other_from_frame) self.assert_eq( pidx.intersection(pidx_other_from_frame), - psidx.intersection(psidx_other_from_frame).sort_values(), + kidx.intersection(kidx_other_from_frame).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pidx_other_from_frame), - (psidx + 1).intersection(psidx_other_from_frame).sort_values(), + (kidx + 1).intersection(kidx_other_from_frame).sort_values(), ) # other = MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - psidx.intersection(psmidx).sort_values(), - psidx._psdf.head(0).index.rename(None), + kidx.intersection(kmidx).sort_values(), + kidx._kdf.head(0).index.rename(None), almost=True, ) self.assert_eq( - (psidx + 1).intersection(psmidx).sort_values(), - psidx._psdf.head(0).index.rename(None), + (kidx + 1).intersection(kmidx).sort_values(), + kidx._kdf.head(0).index.rename(None), almost=True, ) else: self.assert_eq( - pidx.intersection(pmidx), psidx.intersection(psmidx).sort_values(), almost=True + pidx.intersection(pmidx), kidx.intersection(kmidx).sort_values(), almost=True ) self.assert_eq( (pidx + 1).intersection(pmidx), - (psidx + 1).intersection(psmidx).sort_values(), + (kidx + 1).intersection(kmidx).sort_values(), almost=True, ) # other = Series pser = pd.Series([3, 4, 5, 6]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): - self.assert_eq(psidx.intersection(psser).sort_values(), ps.Index([3, 
4], name="Koalas")) + self.assert_eq(kidx.intersection(kser).sort_values(), ps.Index([3, 4], name="Koalas")) self.assert_eq( - (psidx + 1).intersection(psser).sort_values(), ps.Index([3, 4, 5], name="Koalas") + (kidx + 1).intersection(kser).sort_values(), ps.Index([3, 4, 5], name="Koalas") ) else: - self.assert_eq(pidx.intersection(pser), psidx.intersection(psser).sort_values()) + self.assert_eq(pidx.intersection(pser), kidx.intersection(kser).sort_values()) self.assert_eq( - (pidx + 1).intersection(pser), (psidx + 1).intersection(psser).sort_values() + (pidx + 1).intersection(pser), (kidx + 1).intersection(kser).sort_values() ) pser_different_name = pd.Series([3, 4, 5, 6], name="Databricks") - psser_different_name = ps.from_pandas(pser_different_name) + kser_different_name = ps.from_pandas(pser_different_name) if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - psidx.intersection(psser_different_name).sort_values(), + kidx.intersection(kser_different_name).sort_values(), ps.Index([3, 4], name="Koalas"), ) self.assert_eq( - (psidx + 1).intersection(psser_different_name).sort_values(), + (kidx + 1).intersection(kser_different_name).sort_values(), ps.Index([3, 4, 5], name="Koalas"), ) else: self.assert_eq( pidx.intersection(pser_different_name), - psidx.intersection(psser_different_name).sort_values(), + kidx.intersection(kser_different_name).sort_values(), ) self.assert_eq( (pidx + 1).intersection(pser_different_name), - (psidx + 1).intersection(psser_different_name).sort_values(), + (kidx + 1).intersection(kser_different_name).sort_values(), ) others = ([3, 4, 5, 6], (3, 4, 5, 6), {3: None, 4: None, 5: None, 6: None}) for other in others: if LooseVersion(pd.__version__) < LooseVersion("1.2.0"): self.assert_eq( - psidx.intersection(other).sort_values(), ps.Index([3, 4], name="Koalas") + kidx.intersection(other).sort_values(), ps.Index([3, 4], name="Koalas") ) self.assert_eq( - (psidx + 1).intersection(other).sort_values(), - ps.Index([3, 4, 5], name="Koalas"), + (kidx + 1).intersection(other).sort_values(), ps.Index([3, 4, 5], name="Koalas") ) else: - self.assert_eq(pidx.intersection(other), psidx.intersection(other).sort_values()) + self.assert_eq(pidx.intersection(other), kidx.intersection(other).sort_values()) self.assert_eq( - (pidx + 1).intersection(other), (psidx + 1).intersection(other).sort_values() + (pidx + 1).intersection(other), (kidx + 1).intersection(other).sort_values() ) # MultiIndex / other = Index self.assert_eq( - pmidx.intersection(pidx), psmidx.intersection(psidx).sort_values(), almost=True + pmidx.intersection(pidx), kmidx.intersection(kidx).sort_values(), almost=True ) self.assert_eq( pmidx.intersection(pidx_other_from_frame), - psmidx.intersection(psidx_other_from_frame).sort_values(), + kmidx.intersection(kidx_other_from_frame).sort_values(), almost=True, ) # MultiIndex / other = MultiIndex pmidx_other = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")]) - psmidx_other = ps.from_pandas(pmidx_other) + kmidx_other = ps.from_pandas(pmidx_other) self.assert_eq( - pmidx.intersection(pmidx_other), psmidx.intersection(psmidx_other).sort_values() + pmidx.intersection(pmidx_other), kmidx.intersection(kmidx_other).sort_values() ) # MultiIndex / other = list other = [("c", "z"), ("d", "w")] - self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) # MultiIndex / other = tuple other = (("c", "z"), ("d", "w")) - 
self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) # MultiIndex / other = dict other = {("c", "z"): None, ("d", "w"): None} - self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values()) + self.assert_eq(pmidx.intersection(other), kmidx.intersection(other).sort_values()) with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"): - psidx.intersection(4) + kidx.intersection(4) with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - psmidx.intersection(4) + kmidx.intersection(4) with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): - psmidx.intersection(ps.Series([3, 4, 5, 6])) + kmidx.intersection(ps.Series([3, 4, 5, 6])) with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): - psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) + kidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): - psmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) + kmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) def test_item(self): pidx = pd.Index([10]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.item(), psidx.item()) + self.assert_eq(pidx.item(), kidx.item()) # with timestamp pidx = pd.Index([datetime(1990, 3, 9)]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.item(), psidx.item()) + self.assert_eq(pidx.item(), kidx.item()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.item(), psmidx.item()) + self.assert_eq(pmidx.item(), kmidx.item()) # MultiIndex with timestamp pmidx = pd.MultiIndex.from_tuples([(datetime(1990, 3, 9), datetime(2019, 8, 15))]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pidx.item(), psidx.item()) + self.assert_eq(pidx.item(), kidx.item()) err_msg = "can only convert an array of size 1 to a Python scalar" with self.assertRaisesRegex(ValueError, err_msg): @@ -1927,43 +1917,43 @@ def test_item(self): def test_inferred_type(self): # Integer pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, psidx.inferred_type) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, kidx.inferred_type) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, psidx.inferred_type) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, kidx.inferred_type) # String pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, psidx.inferred_type) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, kidx.inferred_type) # Boolean pidx = pd.Index([True, False, True, False]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.inferred_type, psidx.inferred_type) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.inferred_type, kidx.inferred_type) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.inferred_type, psmidx.inferred_type) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.inferred_type, kmidx.inferred_type) def test_multi_index_from_index(self): tuples = [(1, "red"), (1, 
"blue"), (2, "red"), (2, "blue")] pmidx = pd.Index(tuples) - psmidx = ps.Index(tuples) + kmidx = ps.Index(tuples) - self.assertTrue(isinstance(psmidx, ps.MultiIndex)) - self.assert_eq(pmidx, psmidx) + self.assertTrue(isinstance(kmidx, ps.MultiIndex)) + self.assert_eq(pmidx, kmidx) # Specify the `names` pmidx = pd.Index(tuples, names=["Hello", "Koalas"]) - psmidx = ps.Index(tuples, names=["Hello", "Koalas"]) + kmidx = ps.Index(tuples, names=["Hello", "Koalas"]) - self.assertTrue(isinstance(psmidx, ps.MultiIndex)) - self.assert_eq(pmidx, psmidx) + self.assertTrue(isinstance(kmidx, ps.MultiIndex)) + self.assert_eq(pmidx, kmidx) @unittest.skipIf( LooseVersion(pd.__version__) < LooseVersion("0.24"), @@ -1973,129 +1963,126 @@ def test_multiindex_from_frame(self): pdf = pd.DataFrame( [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"] ) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pidx = pd.MultiIndex.from_frame(pdf) - psidx = ps.MultiIndex.from_frame(psdf) + kidx = ps.MultiIndex.from_frame(kdf) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) # Specify `names` pidx = pd.MultiIndex.from_frame(pdf, names=["state", "observation"]) - psidx = ps.MultiIndex.from_frame(psdf, names=["state", "observation"]) - self.assert_eq(pidx, psidx) + kidx = ps.MultiIndex.from_frame(kdf, names=["state", "observation"]) + self.assert_eq(pidx, kidx) pidx = pd.MultiIndex.from_frame(pdf, names=("state", "observation")) - psidx = ps.MultiIndex.from_frame(psdf, names=("state", "observation")) - self.assert_eq(pidx, psidx) + kidx = ps.MultiIndex.from_frame(kdf, names=("state", "observation")) + self.assert_eq(pidx, kidx) # MultiIndex columns pidx = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")]) pdf.columns = pidx - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pidx = pd.MultiIndex.from_frame(pdf) - psidx = ps.MultiIndex.from_frame(psdf) + kidx = ps.MultiIndex.from_frame(kdf) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) # tuples for names pidx = pd.MultiIndex.from_frame(pdf, names=[("a", "w"), ("b", "x")]) - psidx = ps.MultiIndex.from_frame(psdf, names=[("a", "w"), ("b", "x")]) + kidx = ps.MultiIndex.from_frame(kdf, names=[("a", "w"), ("b", "x")]) - self.assert_eq(pidx, psidx) + self.assert_eq(pidx, kidx) err_msg = "Input must be a DataFrame" with self.assertRaisesRegex(TypeError, err_msg): ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) - self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(psdf, names="ab")) + self.assertRaises(ValueError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) # non-string names self.assert_eq( - ps.MultiIndex.from_frame(psdf, names=[0, 1]), - pd.MultiIndex.from_frame(pdf, names=[0, 1]), + ps.MultiIndex.from_frame(kdf, names=[0, 1]), pd.MultiIndex.from_frame(pdf, names=[0, 1]) ) self.assert_eq( - ps.MultiIndex.from_frame(psdf, names=[("x", 0), ("y", 1)]), + ps.MultiIndex.from_frame(kdf, names=[("x", 0), ("y", 1)]), pd.MultiIndex.from_frame(pdf, names=[("x", 0), ("y", 1)]), ) pdf = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]]) - psdf = ps.from_pandas(pdf) - self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf)) + kdf = ps.from_pandas(pdf) + self.assert_eq(ps.MultiIndex.from_frame(kdf), pd.MultiIndex.from_frame(pdf)) def test_is_type_compatible(self): data_types = ["integer", "floating", "string", "boolean"] # Integer pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) for data_type in 
data_types: - self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) # String pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) # Boolean pidx = pd.Index([True, False, True, False]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) for data_type in data_types: - self.assert_eq(pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)) + self.assert_eq(pidx.is_type_compatible(data_type), kidx.is_type_compatible(data_type)) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) for data_type in data_types: - self.assert_eq( - pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type) - ) + self.assert_eq(pmidx.is_type_compatible(data_type), kmidx.is_type_compatible(data_type)) def test_asi8(self): # Integer pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) - self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8) - self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8) - self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) + self.assert_eq(pidx.astype("int").asi8, kidx.astype("int").asi8) + self.assert_eq(pidx.astype("int16").asi8, kidx.astype("int16").asi8) + self.assert_eq(pidx.astype("int8").asi8, kidx.astype("int8").asi8) # Integer with missing value pidx = pd.Index([1, 2, None, 4, 5]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) # Datetime pidx = pd.date_range(end="1/1/2018", periods=3) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) # Floating pidx = pd.Index([1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) # String pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) # Boolean pidx = pd.Index([True, False, True, False]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.asi8, psidx.asi8) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.asi8, kidx.asi8) # MultiIndex pmidx = pd.MultiIndex.from_tuples([(1, 2)]) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.asi8, psmidx.asi8) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.asi8, kmidx.asi8) def test_index_is_unique(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] @@ -2104,9 +2091,9 @@ def test_index_is_unique(self): for idx, name, expected in zip(indexes, names, is_uniq): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name)) - psdf = 
ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.is_unique, expected) + self.assertEqual(kdf.index.is_unique, expected) def test_multiindex_is_unique(self): indexes = [ @@ -2119,175 +2106,175 @@ def test_multiindex_is_unique(self): for idx, expected in zip(indexes, is_uniq): pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assertEqual(psdf.index.is_unique, expected) + self.assertEqual(kdf.index.is_unique, expected) def test_view(self): pidx = pd.Index([1, 2, 3, 4], name="Koalas") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(pidx.view(), psidx.view()) + self.assert_eq(pidx.view(), kidx.view()) # MultiIndex pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.view(), psmidx.view()) + self.assert_eq(pmidx.view(), kmidx.view()) def test_insert(self): # Integer pidx = pd.Index([1, 2, 3], name="Koalas") - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, 100), psidx.insert(1, 100)) - self.assert_eq(pidx.insert(-1, 100), psidx.insert(-1, 100)) - self.assert_eq(pidx.insert(100, 100), psidx.insert(100, 100)) - self.assert_eq(pidx.insert(-100, 100), psidx.insert(-100, 100)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, 100), kidx.insert(1, 100)) + self.assert_eq(pidx.insert(-1, 100), kidx.insert(-1, 100)) + self.assert_eq(pidx.insert(100, 100), kidx.insert(100, 100)) + self.assert_eq(pidx.insert(-100, 100), kidx.insert(-100, 100)) # Floating pidx = pd.Index([1.0, 2.0, 3.0], name="Koalas") - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, 100.0), psidx.insert(1, 100.0)) - self.assert_eq(pidx.insert(-1, 100.0), psidx.insert(-1, 100.0)) - self.assert_eq(pidx.insert(100, 100.0), psidx.insert(100, 100.0)) - self.assert_eq(pidx.insert(-100, 100.0), psidx.insert(-100, 100.0)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, 100.0), kidx.insert(1, 100.0)) + self.assert_eq(pidx.insert(-1, 100.0), kidx.insert(-1, 100.0)) + self.assert_eq(pidx.insert(100, 100.0), kidx.insert(100, 100.0)) + self.assert_eq(pidx.insert(-100, 100.0), kidx.insert(-100, 100.0)) # String pidx = pd.Index(["a", "b", "c"], name="Koalas") - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, "x"), psidx.insert(1, "x")) - self.assert_eq(pidx.insert(-1, "x"), psidx.insert(-1, "x")) - self.assert_eq(pidx.insert(100, "x"), psidx.insert(100, "x")) - self.assert_eq(pidx.insert(-100, "x"), psidx.insert(-100, "x")) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, "x"), kidx.insert(1, "x")) + self.assert_eq(pidx.insert(-1, "x"), kidx.insert(-1, "x")) + self.assert_eq(pidx.insert(100, "x"), kidx.insert(100, "x")) + self.assert_eq(pidx.insert(-100, "x"), kidx.insert(-100, "x")) # Boolean pidx = pd.Index([True, False, True, False], name="Koalas") - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.insert(1, True), psidx.insert(1, True)) - self.assert_eq(pidx.insert(-1, True), psidx.insert(-1, True)) - self.assert_eq(pidx.insert(100, True), psidx.insert(100, True)) - self.assert_eq(pidx.insert(-100, True), psidx.insert(-100, True)) + kidx = ps.from_pandas(pidx) + self.assert_eq(pidx.insert(1, True), kidx.insert(1, True)) + self.assert_eq(pidx.insert(-1, True), kidx.insert(-1, True)) + self.assert_eq(pidx.insert(100, True), kidx.insert(100, True)) + self.assert_eq(pidx.insert(-100, True), kidx.insert(-100, True)) # MultiIndex pmidx = 
pd.MultiIndex.from_tuples( [("a", "x"), ("b", "y"), ("c", "z")], names=["Hello", "Koalas"] ) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(pmidx.insert(2, ("h", "j")), psmidx.insert(2, ("h", "j"))) - self.assert_eq(pmidx.insert(-1, ("h", "j")), psmidx.insert(-1, ("h", "j"))) + kmidx = ps.from_pandas(pmidx) + self.assert_eq(pmidx.insert(2, ("h", "j")), kmidx.insert(2, ("h", "j"))) + self.assert_eq(pmidx.insert(-1, ("h", "j")), kmidx.insert(-1, ("h", "j"))) err_msg = "index 4 is out of bounds for axis 0 with size 3" with self.assertRaisesRegex(IndexError, err_msg): - psmidx.insert(4, ("b", "y")) + kmidx.insert(4, ("b", "y")) def test_astype(self): pidx = pd.Index([10, 20, 15, 30, 45], name="x") - psidx = ps.Index(pidx) - - self.assert_eq(psidx.astype(int), pidx.astype(int)) - self.assert_eq(psidx.astype(np.int), pidx.astype(np.int)) - self.assert_eq(psidx.astype(np.int8), pidx.astype(np.int8)) - self.assert_eq(psidx.astype(np.int16), pidx.astype(np.int16)) - self.assert_eq(psidx.astype(np.int32), pidx.astype(np.int32)) - self.assert_eq(psidx.astype(np.int64), pidx.astype(np.int64)) - self.assert_eq(psidx.astype(np.byte), pidx.astype(np.byte)) - self.assert_eq(psidx.astype("int"), pidx.astype("int")) - self.assert_eq(psidx.astype("int8"), pidx.astype("int8")) - self.assert_eq(psidx.astype("int16"), pidx.astype("int16")) - self.assert_eq(psidx.astype("int32"), pidx.astype("int32")) - self.assert_eq(psidx.astype("int64"), pidx.astype("int64")) - self.assert_eq(psidx.astype("b"), pidx.astype("b")) - self.assert_eq(psidx.astype("byte"), pidx.astype("byte")) - self.assert_eq(psidx.astype("i"), pidx.astype("i")) - self.assert_eq(psidx.astype("long"), pidx.astype("long")) - self.assert_eq(psidx.astype("short"), pidx.astype("short")) - self.assert_eq(psidx.astype(np.float), pidx.astype(np.float)) - self.assert_eq(psidx.astype(np.float32), pidx.astype(np.float32)) - self.assert_eq(psidx.astype(np.float64), pidx.astype(np.float64)) - self.assert_eq(psidx.astype("float"), pidx.astype("float")) - self.assert_eq(psidx.astype("float32"), pidx.astype("float32")) - self.assert_eq(psidx.astype("float64"), pidx.astype("float64")) - self.assert_eq(psidx.astype("double"), pidx.astype("double")) - self.assert_eq(psidx.astype("f"), pidx.astype("f")) - self.assert_eq(psidx.astype(bool), pidx.astype(bool)) - self.assert_eq(psidx.astype("bool"), pidx.astype("bool")) - self.assert_eq(psidx.astype("?"), pidx.astype("?")) - self.assert_eq(psidx.astype(np.unicode_), pidx.astype(np.unicode_)) - self.assert_eq(psidx.astype("str"), pidx.astype("str")) - self.assert_eq(psidx.astype("U"), pidx.astype("U")) + kidx = ps.Index(pidx) + + self.assert_eq(kidx.astype(int), pidx.astype(int)) + self.assert_eq(kidx.astype(np.int), pidx.astype(np.int)) + self.assert_eq(kidx.astype(np.int8), pidx.astype(np.int8)) + self.assert_eq(kidx.astype(np.int16), pidx.astype(np.int16)) + self.assert_eq(kidx.astype(np.int32), pidx.astype(np.int32)) + self.assert_eq(kidx.astype(np.int64), pidx.astype(np.int64)) + self.assert_eq(kidx.astype(np.byte), pidx.astype(np.byte)) + self.assert_eq(kidx.astype("int"), pidx.astype("int")) + self.assert_eq(kidx.astype("int8"), pidx.astype("int8")) + self.assert_eq(kidx.astype("int16"), pidx.astype("int16")) + self.assert_eq(kidx.astype("int32"), pidx.astype("int32")) + self.assert_eq(kidx.astype("int64"), pidx.astype("int64")) + self.assert_eq(kidx.astype("b"), pidx.astype("b")) + self.assert_eq(kidx.astype("byte"), pidx.astype("byte")) + self.assert_eq(kidx.astype("i"), pidx.astype("i")) + 
self.assert_eq(kidx.astype("long"), pidx.astype("long")) + self.assert_eq(kidx.astype("short"), pidx.astype("short")) + self.assert_eq(kidx.astype(np.float), pidx.astype(np.float)) + self.assert_eq(kidx.astype(np.float32), pidx.astype(np.float32)) + self.assert_eq(kidx.astype(np.float64), pidx.astype(np.float64)) + self.assert_eq(kidx.astype("float"), pidx.astype("float")) + self.assert_eq(kidx.astype("float32"), pidx.astype("float32")) + self.assert_eq(kidx.astype("float64"), pidx.astype("float64")) + self.assert_eq(kidx.astype("double"), pidx.astype("double")) + self.assert_eq(kidx.astype("f"), pidx.astype("f")) + self.assert_eq(kidx.astype(bool), pidx.astype(bool)) + self.assert_eq(kidx.astype("bool"), pidx.astype("bool")) + self.assert_eq(kidx.astype("?"), pidx.astype("?")) + self.assert_eq(kidx.astype(np.unicode_), pidx.astype(np.unicode_)) + self.assert_eq(kidx.astype("str"), pidx.astype("str")) + self.assert_eq(kidx.astype("U"), pidx.astype("U")) pidx = pd.Index([10, 20, 15, 30, 45, None], name="x") - psidx = ps.Index(pidx) + kidx = ps.Index(pidx) pidx = pd.Index(["hi", "hi ", " ", " \t", "", None], name="x") - psidx = ps.Index(pidx) + kidx = ps.Index(pidx) - self.assert_eq(psidx.astype(bool), pidx.astype(bool)) - self.assert_eq(psidx.astype(str).to_numpy(), ["hi", "hi ", " ", " \t", "", "None"]) + self.assert_eq(kidx.astype(bool), pidx.astype(bool)) + self.assert_eq(kidx.astype(str).to_numpy(), ["hi", "hi ", " ", " \t", "", "None"]) pidx = pd.Index([True, False, None], name="x") - psidx = ps.Index(pidx) + kidx = ps.Index(pidx) - self.assert_eq(psidx.astype(bool), pidx.astype(bool)) + self.assert_eq(kidx.astype(bool), pidx.astype(bool)) pidx = pd.Index(["2020-10-27"], name="x") - psidx = ps.Index(pidx) + kidx = ps.Index(pidx) - self.assert_eq(psidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]")) + self.assert_eq(kidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]")) with self.assertRaisesRegex(TypeError, "not understood"): - psidx.astype("int63") + kidx.astype("int63") def test_to_list(self): # Index pidx = pd.Index([1, 2, 3, 4, 5]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) # MultiIndex tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] pmidx = pd.MultiIndex.from_tuples(tuples) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assert_eq(psidx.tolist(), pidx.tolist()) - self.assert_eq(psmidx.tolist(), pmidx.tolist()) + self.assert_eq(kidx.tolist(), pidx.tolist()) + self.assert_eq(kmidx.tolist(), pmidx.tolist()) def test_index_ops(self): pidx = pd.Index([1, 2, 3, 4, 5]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx) + self.assert_eq(kidx * 100 + kidx * 10 + kidx, pidx * 100 + pidx * 10 + pidx) pidx = pd.Index([1, 2, 3, 4, 5], name="a") - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) - self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx) + self.assert_eq(kidx * 100 + kidx * 10 + kidx, pidx * 100 + pidx * 10 + pidx) pdf = pd.DataFrame( index=pd.MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6)], names=["a", "b"]) ) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pidx1 = pdf.index.get_level_values(0) pidx2 = pdf.index.get_level_values(1) - psidx1 = psdf.index.get_level_values(0) - psidx2 = psdf.index.get_level_values(1) + kidx1 = kdf.index.get_level_values(0) + kidx2 = kdf.index.get_level_values(1) if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(psidx1 * 10 
+ psidx2, pidx1 * 10 + pidx2) + self.assert_eq(kidx1 * 10 + kidx2, pidx1 * 10 + pidx2) else: - self.assert_eq(psidx1 * 10 + psidx2, (pidx1 * 10 + pidx2).rename(None)) + self.assert_eq(kidx1 * 10 + kidx2, (pidx1 * 10 + pidx2).rename(None)) def test_factorize(self): pidx = pd.Index(["a", "b", "a", "b"]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) pcodes, puniques = pidx.factorize(sort=True) - kcodes, kuniques = psidx.factorize() + kcodes, kuniques = kidx.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - psmidx = ps.from_pandas(pmidx) + kmidx = ps.from_pandas(pmidx) - self.assertRaises(PandasNotImplementedError, lambda: psmidx.factorize()) + self.assertRaises(PandasNotImplementedError, lambda: kmidx.factorize()) if __name__ == "__main__": @@ -2295,8 +2282,7 @@ def test_factorize(self): try: import xmlrunner # type: ignore[import] - - testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 2618084f3ac92..eae26bc4c876b 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -51,33 +51,33 @@ def pser(self): return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") @property - def psser(self): + def kser(self): return ps.from_pandas(self.pser) def test_series_ops(self): pser = self.pser - psser = self.psser + kser = self.kser - self.assert_eq(psser + 1, pser + 1) - self.assert_eq(1 + psser, 1 + pser) - self.assert_eq(psser + 1 + 10 * psser, pser + 1 + 10 * pser) - self.assert_eq(psser + 1 + 10 * psser.index, pser + 1 + 10 * pser.index) - self.assert_eq(psser.index + 1 + 10 * psser, pser.index + 1 + 10 * pser) + self.assert_eq(kser + 1, pser + 1) + self.assert_eq(1 + kser, 1 + pser) + self.assert_eq(kser + 1 + 10 * kser, pser + 1 + 10 * pser) + self.assert_eq(kser + 1 + 10 * kser.index, pser + 1 + 10 * pser.index) + self.assert_eq(kser.index + 1 + 10 * kser, pser.index + 1 + 10 * pser) def test_series_tuple_name(self): pser = self.pser pser.name = ("x", "a") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser, pser) - self.assert_eq(psser.name, pser.name) + self.assert_eq(kser, pser) + self.assert_eq(kser.name, pser.name) pser.name = ("y", "z") - psser.name = ("y", "z") + kser.name = ("y", "z") - self.assert_eq(psser, pser) - self.assert_eq(psser.name, pser.name) + self.assert_eq(kser, pser) + self.assert_eq(kser.name, pser.name) def test_repr_cache_invalidation(self): # If there is any cache, inplace operations should invalidate it. 
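Illustration (not part of the patch): the hunk below exercises repr caching: if the pandas-on-Spark object keeps any cached string representation, an in-place mutation must invalidate it so that later reprs reflect the new state. A minimal sketch of the check, assuming an active Spark session:

    import pyspark.pandas as ps

    s = ps.Series([1, 2, 3])
    repr(s)                      # may populate an internal repr cache
    s.rename("a", inplace=True)  # the cached repr, if any, must be dropped
    assert repr(s) == repr(s.rename("a"))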
@@ -86,12 +86,12 @@ def test_repr_cache_invalidation(self): s.rename("a", inplace=True) self.assertEqual(s.__repr__(), s.rename("a").__repr__()) - def _check_extension(self, psser, pser): + def _check_extension(self, kser, pser): if LooseVersion("1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.2.2"): - self.assert_eq(psser, pser, check_exact=False) - self.assertTrue(isinstance(psser.dtype, extension_dtypes)) + self.assert_eq(kser, pser, check_exact=False) + self.assertTrue(isinstance(kser.dtype, extension_dtypes)) else: - self.assert_eq(psser, pser) + self.assert_eq(kser, pser) @unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are not available") def test_extension_dtypes(self): @@ -101,11 +101,11 @@ def test_extension_dtypes(self): pd.Series([1, 2, None, 4], dtype="Int32"), pd.Series([1, 2, None, 4], dtype="Int64"), ]: - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self._check_extension(psser, pser) - self._check_extension(psser + F.lit(1).cast("byte"), pser + 1) - self._check_extension(psser + psser, pser + pser) + self._check_extension(kser, pser) + self._check_extension(kser + F.lit(1).cast("byte"), pser + 1) + self._check_extension(kser + kser, pser + pser) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -113,17 +113,17 @@ def test_extension_dtypes(self): def test_extension_object_dtypes(self): # string pser = pd.Series(["a", None, "c", "d"], dtype="string") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self._check_extension(psser, pser) + self._check_extension(kser, pser) # boolean pser = pd.Series([True, False, True, None], dtype="boolean") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self._check_extension(psser, pser) - self._check_extension(psser & psser, pser & pser) - self._check_extension(psser | psser, pser | pser) + self._check_extension(kser, pser) + self._check_extension(kser & kser, pser & pser) + self._check_extension(kser | kser, pser | pser) @unittest.skipIf( not extension_float_dtypes_available, "pandas extension float dtypes are not available" @@ -133,11 +133,11 @@ def test_extension_float_dtypes(self): pd.Series([1.0, 2.0, None, 4.0], dtype="Float32"), pd.Series([1.0, 2.0, None, 4.0], dtype="Float64"), ]: - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self._check_extension(psser, pser) - self._check_extension(psser + 1, pser + 1) - self._check_extension(psser + psser, pser + pser) + self._check_extension(kser, pser) + self._check_extension(kser + 1, pser + 1) + self._check_extension(kser + kser, pser + pser) def test_empty_series(self): pser_a = pd.Series([], dtype="i1") @@ -145,8 +145,8 @@ def test_empty_series(self): self.assert_eq(ps.from_pandas(pser_a), pser_a) - psser_b = ps.from_pandas(pser_b) - self.assert_eq(psser_b, pser_b) + kser_b = ps.from_pandas(pser_b) + self.assert_eq(kser_b, pser_b) with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): self.assert_eq(ps.from_pandas(pser_a), pser_a) @@ -158,165 +158,165 @@ def test_all_null_series(self): self.assert_eq(ps.from_pandas(pser_a), pser_a) - psser_b = ps.from_pandas(pser_b) - self.assert_eq(psser_b, pser_b) + kser_b = ps.from_pandas(pser_b) + self.assert_eq(kser_b, pser_b) with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): self.assert_eq(ps.from_pandas(pser_a), pser_a) self.assert_eq(ps.from_pandas(pser_b), pser_b) def test_head(self): - psser = self.psser + kser = self.kser pser = self.pser - self.assert_eq(psser.head(3), pser.head(3)) - 
self.assert_eq(psser.head(0), pser.head(0)) - self.assert_eq(psser.head(-3), pser.head(-3)) - self.assert_eq(psser.head(-10), pser.head(-10)) + self.assert_eq(kser.head(3), pser.head(3)) + self.assert_eq(kser.head(0), pser.head(0)) + self.assert_eq(kser.head(-3), pser.head(-3)) + self.assert_eq(kser.head(-10), pser.head(-10)) def test_last(self): with self.assertRaises(TypeError): - self.psser.last("1D") + self.kser.last("1D") index = pd.date_range("2018-04-09", periods=4, freq="2D") pser = pd.Series([1, 2, 3, 4], index=index) - psser = ps.from_pandas(pser) - self.assert_eq(psser.last("1D"), pser.last("1D")) + kser = ps.from_pandas(pser) + self.assert_eq(kser.last("1D"), pser.last("1D")) def test_first(self): with self.assertRaises(TypeError): - self.psser.first("1D") + self.kser.first("1D") index = pd.date_range("2018-04-09", periods=4, freq="2D") pser = pd.Series([1, 2, 3, 4], index=index) - psser = ps.from_pandas(pser) - self.assert_eq(psser.first("1D"), pser.first("1D")) + kser = ps.from_pandas(pser) + self.assert_eq(kser.first("1D"), pser.first("1D")) def test_rename(self): pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pser.name = "renamed" - psser.name = "renamed" - self.assertEqual(psser.name, "renamed") - self.assert_eq(psser, pser) + kser.name = "renamed" + self.assertEqual(kser.name, "renamed") + self.assert_eq(kser, pser) pser.name = None - psser.name = None - self.assertEqual(psser.name, None) - self.assert_eq(psser, pser) + kser.name = None + self.assertEqual(kser.name, None) + self.assert_eq(kser, pser) pidx = pser.index - psidx = psser.index + kidx = kser.index pidx.name = "renamed" - psidx.name = "renamed" - self.assertEqual(psidx.name, "renamed") - self.assert_eq(psidx, pidx) + kidx.name = "renamed" + self.assertEqual(kidx.name, "renamed") + self.assert_eq(kidx, pidx) expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - psser.name = ["renamed"] + kser.name = ["renamed"] with self.assertRaisesRegex(TypeError, expected_error_message): - psser.name = ["0", "1"] + kser.name = ["0", "1"] with self.assertRaisesRegex(TypeError, expected_error_message): ps.Series([1, 2, 3], name=["0", "1"]) def test_rename_method(self): # Series name pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.rename("y"), pser.rename("y")) - self.assertEqual(psser.name, "x") # no mutation - self.assert_eq(psser.rename(), pser.rename()) + self.assert_eq(kser.rename("y"), pser.rename("y")) + self.assertEqual(kser.name, "x") # no mutation + self.assert_eq(kser.rename(), pser.rename()) - self.assert_eq((psser.rename("y") + 1).head(), (pser.rename("y") + 1).head()) + self.assert_eq((kser.rename("y") + 1).head(), (pser.rename("y") + 1).head()) - psser.rename("z", inplace=True) + kser.rename("z", inplace=True) pser.rename("z", inplace=True) - self.assertEqual(psser.name, "z") - self.assert_eq(psser, pser) + self.assertEqual(kser.name, "z") + self.assert_eq(kser, pser) expected_error_message = "Series.name must be a hashable type" with self.assertRaisesRegex(TypeError, expected_error_message): - psser.rename(["0", "1"]) + kser.rename(["0", "1"]) # Series index # pser = pd.Series(['a', 'b', 'c', 'd', 'e', 'f', 'g'], name='x') - # psser = ps.from_pandas(s) + # kser = ps.from_pandas(s) # TODO: index - # res = psser.rename(lambda x: x ** 2) + # res = kser.rename(lambda x: x ** 2) # 
self.assert_eq(res, pser.rename(lambda x: x ** 2)) - # res = psser.rename(pser) + # res = kser.rename(pser) # self.assert_eq(res, pser.rename(pser)) - # res = psser.rename(psser) + # res = kser.rename(kser) # self.assert_eq(res, pser.rename(pser)) - # res = psser.rename(lambda x: x**2, inplace=True) - # self.assertis(res, psser) + # res = kser.rename(lambda x: x**2, inplace=True) + # self.assertis(res, kser) # s.rename(lambda x: x**2, inplace=True) - # self.assert_eq(psser, pser) + # self.assert_eq(kser, pser) def test_rename_axis(self): index = pd.Index(["A", "B", "C"], name="index") pser = pd.Series([1.0, 2.0, 3.0], index=index, name="name") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.rename_axis("index2").sort_index(), psser.rename_axis("index2").sort_index(), + pser.rename_axis("index2").sort_index(), kser.rename_axis("index2").sort_index(), ) self.assert_eq( (pser + 1).rename_axis("index2").sort_index(), - (psser + 1).rename_axis("index2").sort_index(), + (kser + 1).rename_axis("index2").sort_index(), ) pser2 = pser.copy() - psser2 = psser.copy() + kser2 = kser.copy() pser2.rename_axis("index2", inplace=True) - psser2.rename_axis("index2", inplace=True) - self.assert_eq(pser2.sort_index(), psser2.sort_index()) + kser2.rename_axis("index2", inplace=True) + self.assert_eq(pser2.sort_index(), kser2.sort_index()) - self.assertRaises(ValueError, lambda: psser.rename_axis(["index2", "index3"])) - self.assertRaises(TypeError, lambda: psser.rename_axis(mapper=["index2"], index=["index3"])) + self.assertRaises(ValueError, lambda: kser.rename_axis(["index2", "index3"])) + self.assertRaises(TypeError, lambda: kser.rename_axis(mapper=["index2"], index=["index3"])) # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): self.assert_eq( pser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), - psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), + kser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), ) self.assert_eq( pser.rename_axis(index=str.upper).sort_index(), - psser.rename_axis(index=str.upper).sort_index(), + kser.rename_axis(index=str.upper).sort_index(), ) else: - expected = psser + expected = kser expected.index.name = "index2" - result = psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index() + result = kser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index() self.assert_eq(expected, result) - expected = psser + expected = kser expected.index.name = "INDEX" - result = psser.rename_axis(index=str.upper).sort_index() + result = kser.rename_axis(index=str.upper).sort_index() self.assert_eq(expected, result) index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] ) pser = pd.Series([1.0, 2.0, 3.0], index=index, name="name") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( pser.rename_axis(["index3", "index4"]).sort_index(), - psser.rename_axis(["index3", "index4"]).sort_index(), + kser.rename_axis(["index3", "index4"]).sort_index(), ) - self.assertRaises(ValueError, lambda: psser.rename_axis(["index3", "index4", "index5"])) + self.assertRaises(ValueError, lambda: kser.rename_axis(["index3", "index4", "index5"])) # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): @@ -324,25 +324,25 
@@ def test_rename_axis(self): pser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index(), - psser.rename_axis( + kser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index(), ) self.assert_eq( pser.rename_axis(index=str.upper).sort_index(), - psser.rename_axis(index=str.upper).sort_index(), + kser.rename_axis(index=str.upper).sort_index(), ) else: - expected = psser + expected = kser expected.index.names = ["index3", "index4"] - result = psser.rename_axis( + result = kser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} ).sort_index() self.assert_eq(expected, result) expected.index.names = ["INDEX1", "INDEX2"] - result = psser.rename_axis(index=str.upper).sort_index() + result = kser.rename_axis(index=str.upper).sort_index() self.assert_eq(expected, result) def test_or(self): @@ -352,15 +352,15 @@ def test_or(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq(psdf["left"] | psdf["right"], pdf["left"] | pdf["right"]) - self.assert_eq(psdf["left"] | True, pdf["left"] | True) - self.assert_eq(psdf["left"] | False, pdf["left"] | False) - self.assert_eq(psdf["left"] | None, pdf["left"] | None) - self.assert_eq(True | psdf["right"], True | pdf["right"]) - self.assert_eq(False | psdf["right"], False | pdf["right"]) - self.assert_eq(None | psdf["right"], None | pdf["right"]) + self.assert_eq(kdf["left"] | kdf["right"], pdf["left"] | pdf["right"]) + self.assert_eq(kdf["left"] | True, pdf["left"] | True) + self.assert_eq(kdf["left"] | False, pdf["left"] | False) + self.assert_eq(kdf["left"] | None, pdf["left"] | None) + self.assert_eq(True | kdf["right"], True | pdf["right"]) + self.assert_eq(False | kdf["right"], False | pdf["right"]) + self.assert_eq(None | kdf["right"], None | pdf["right"]) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -372,15 +372,15 @@ def test_or_extenstion_dtypes(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ).astype("boolean") - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self._check_extension(psdf["left"] | psdf["right"], pdf["left"] | pdf["right"]) - self._check_extension(psdf["left"] | True, pdf["left"] | True) - self._check_extension(psdf["left"] | False, pdf["left"] | False) - self._check_extension(psdf["left"] | pd.NA, pdf["left"] | pd.NA) - self._check_extension(True | psdf["right"], True | pdf["right"]) - self._check_extension(False | psdf["right"], False | pdf["right"]) - self._check_extension(pd.NA | psdf["right"], pd.NA | pdf["right"]) + self._check_extension(kdf["left"] | kdf["right"], pdf["left"] | pdf["right"]) + self._check_extension(kdf["left"] | True, pdf["left"] | True) + self._check_extension(kdf["left"] | False, pdf["left"] | False) + self._check_extension(kdf["left"] | pd.NA, pdf["left"] | pd.NA) + self._check_extension(True | kdf["right"], True | pdf["right"]) + self._check_extension(False | kdf["right"], False | pdf["right"]) + self._check_extension(pd.NA | kdf["right"], pd.NA | pdf["right"]) def test_and(self): pdf = pd.DataFrame( @@ -389,15 +389,15 @@ def test_and(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq(psdf["left"] & psdf["right"], pdf["left"] & pdf["right"]) - self.assert_eq(psdf["left"] & True, 
pdf["left"] & True) - self.assert_eq(psdf["left"] & False, pdf["left"] & False) - self.assert_eq(psdf["left"] & None, pdf["left"] & None) - self.assert_eq(True & psdf["right"], True & pdf["right"]) - self.assert_eq(False & psdf["right"], False & pdf["right"]) - self.assert_eq(None & psdf["right"], None & pdf["right"]) + self.assert_eq(kdf["left"] & kdf["right"], pdf["left"] & pdf["right"]) + self.assert_eq(kdf["left"] & True, pdf["left"] & True) + self.assert_eq(kdf["left"] & False, pdf["left"] & False) + self.assert_eq(kdf["left"] & None, pdf["left"] & None) + self.assert_eq(True & kdf["right"], True & pdf["right"]) + self.assert_eq(False & kdf["right"], False & pdf["right"]) + self.assert_eq(None & kdf["right"], None & pdf["right"]) @unittest.skipIf( not extension_object_dtypes_available, "pandas extension object dtypes are not available" @@ -409,96 +409,96 @@ def test_and_extenstion_dtypes(self): "right": [True, False, False, True, True, False, np.nan, np.nan, np.nan], } ).astype("boolean") - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self._check_extension(psdf["left"] & psdf["right"], pdf["left"] & pdf["right"]) - self._check_extension(psdf["left"] & True, pdf["left"] & True) - self._check_extension(psdf["left"] & False, pdf["left"] & False) - self._check_extension(psdf["left"] & pd.NA, pdf["left"] & pd.NA) - self._check_extension(True & psdf["right"], True & pdf["right"]) - self._check_extension(False & psdf["right"], False & pdf["right"]) - self._check_extension(pd.NA & psdf["right"], pd.NA & pdf["right"]) + self._check_extension(kdf["left"] & kdf["right"], pdf["left"] & pdf["right"]) + self._check_extension(kdf["left"] & True, pdf["left"] & True) + self._check_extension(kdf["left"] & False, pdf["left"] & False) + self._check_extension(kdf["left"] & pd.NA, pdf["left"] & pd.NA) + self._check_extension(True & kdf["right"], True & pdf["right"]) + self._check_extension(False & kdf["right"], False & pdf["right"]) + self._check_extension(pd.NA & kdf["right"], pd.NA & pdf["right"]) def test_to_numpy(self): pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x") - psser = ps.from_pandas(pser) - self.assert_eq(psser.to_numpy(), pser.values) + kser = ps.from_pandas(pser) + self.assert_eq(kser.to_numpy(), pser.values) def test_isin(self): pser = pd.Series(["lama", "cow", "lama", "beetle", "lama", "hippo"], name="animal") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) - self.assert_eq(psser.isin(np.array(["cow", "lama"])), pser.isin(np.array(["cow", "lama"]))) - self.assert_eq(psser.isin({"cow"}), pser.isin({"cow"})) + self.assert_eq(kser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) + self.assert_eq(kser.isin(np.array(["cow", "lama"])), pser.isin(np.array(["cow", "lama"]))) + self.assert_eq(kser.isin({"cow"}), pser.isin({"cow"})) msg = "only list-like objects are allowed to be passed to isin()" with self.assertRaisesRegex(TypeError, msg): - psser.isin(1) + kser.isin(1) def test_drop_duplicates(self): pdf = pd.DataFrame({"animal": ["lama", "cow", "lama", "beetle", "lama", "hippo"]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.animal - psser = psdf.animal + kser = kdf.animal - self.assert_eq(psser.drop_duplicates().sort_index(), pser.drop_duplicates().sort_index()) + self.assert_eq(kser.drop_duplicates().sort_index(), pser.drop_duplicates().sort_index()) self.assert_eq( - psser.drop_duplicates(keep="last").sort_index(), + kser.drop_duplicates(keep="last").sort_index(), 
pser.drop_duplicates(keep="last").sort_index(), ) # inplace - psser.drop_duplicates(keep=False, inplace=True) + kser.drop_duplicates(keep=False, inplace=True) pser.drop_duplicates(keep=False, inplace=True) - self.assert_eq(psser.sort_index(), pser.sort_index()) - self.assert_eq(psdf, pdf) + self.assert_eq(kser.sort_index(), pser.sort_index()) + self.assert_eq(kdf, pdf) def test_reindex(self): index = ["A", "B", "C", "D", "E"] pser = pd.Series([1.0, 2.0, 3.0, 4.0, None], index=index, name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser, psser) + self.assert_eq(pser, kser) self.assert_eq( - pser.reindex(["A", "B"]).sort_index(), psser.reindex(["A", "B"]).sort_index(), + pser.reindex(["A", "B"]).sort_index(), kser.reindex(["A", "B"]).sort_index(), ) self.assert_eq( pser.reindex(["A", "B", "2", "3"]).sort_index(), - psser.reindex(["A", "B", "2", "3"]).sort_index(), + kser.reindex(["A", "B", "2", "3"]).sort_index(), ) self.assert_eq( pser.reindex(["A", "E", "2"], fill_value=0).sort_index(), - psser.reindex(["A", "E", "2"], fill_value=0).sort_index(), + kser.reindex(["A", "E", "2"], fill_value=0).sort_index(), ) - self.assertRaises(TypeError, lambda: psser.reindex(index=123)) + self.assertRaises(TypeError, lambda: kser.reindex(index=123)) def test_reindex_like(self): data = [1.0, 2.0, None] index = pd.Index(["A", "B", "C"], name="index1") pser = pd.Series(data=data, index=index, name="name1") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) # Reindexing single Index on single Index data2 = [3.0, None, 4.0] index2 = pd.Index(["A", "C", "D"], name="index2") pser2 = pd.Series(data=data2, index=index2, name="name2") - psser2 = ps.from_pandas(pser2) + kser2 = ps.from_pandas(pser2) self.assert_eq( - pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), + pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), ) self.assert_eq( (pser + 1).reindex_like(pser2).sort_index(), - (psser + 1).reindex_like(psser2).sort_index(), + (kser + 1).reindex_like(kser2).sort_index(), ) # Reindexing MultiIndex on single Index @@ -506,24 +506,24 @@ def test_reindex_like(self): [("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"] ) pser2 = pd.Series(data=data2, index=index2, name="name2") - psser2 = ps.from_pandas(pser2) + kser2 = ps.from_pandas(pser2) self.assert_eq( - pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), + pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), ) - self.assertRaises(TypeError, lambda: psser.reindex_like(index2)) - self.assertRaises(AssertionError, lambda: psser2.reindex_like(psser)) + self.assertRaises(TypeError, lambda: kser.reindex_like(index2)) + self.assertRaises(AssertionError, lambda: kser2.reindex_like(kser)) # Reindexing MultiIndex on MultiIndex index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] ) pser = pd.Series(data=data, index=index, name="name1") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(), + pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(), ) # Reindexing with DataFrame @@ -531,88 +531,88 @@ def test_reindex_like(self): [("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"] ) pdf = pd.DataFrame(data=data, index=index2) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) self.assert_eq( - 
pser.reindex_like(pdf).sort_index(), psser.reindex_like(psdf).sort_index(), + pser.reindex_like(pdf).sort_index(), kser.reindex_like(kdf).sort_index(), ) def test_fillna(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.fillna(0), pser.fillna(0)) - self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) + self.assert_eq(kser.fillna(0), pser.fillna(0)) + self.assert_eq(kser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) - psser.fillna(0, inplace=True) + kser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) # test considering series does not have NA/NaN values - psser.fillna(0, inplace=True) + kser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(psser, pser) + self.assert_eq(kser, pser) - psser = psdf.x.rename("y") + kser = kdf.x.rename("y") pser = pdf.x.rename("y") - psser.fillna(0, inplace=True) + kser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(psser.head(), pser.head()) + self.assert_eq(kser.head(), pser.head()) pser = pd.Series([1, 2, 3, 4, 5, 6], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pser.loc[3] = np.nan - psser.loc[3] = np.nan + kser.loc[3] = np.nan - self.assert_eq(psser.fillna(0), pser.fillna(0)) - self.assert_eq(psser.fillna(method="ffill"), pser.fillna(method="ffill")) - self.assert_eq(psser.fillna(method="bfill"), pser.fillna(method="bfill")) + self.assert_eq(kser.fillna(0), pser.fillna(0)) + self.assert_eq(kser.fillna(method="ffill"), pser.fillna(method="ffill")) + self.assert_eq(kser.fillna(method="bfill"), pser.fillna(method="bfill")) # inplace fillna on non-nullable column pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.b - psser = psdf.b + kser = kdf.b - self.assert_eq(psser.fillna(0), pser.fillna(0)) - self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) + self.assert_eq(kser.fillna(0), pser.fillna(0)) + self.assert_eq(kser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0)) - psser.fillna(0, inplace=True) + kser.fillna(0, inplace=True) pser.fillna(0, inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) def test_dropna(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.dropna(), pser.dropna()) + self.assert_eq(kser.dropna(), pser.dropna()) pser.dropna(inplace=True) - psser.dropna(inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + kser.dropna(inplace=True) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) def test_nunique(self): pser = pd.Series([1, 2, 1, np.nan]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) # Assert NaNs are dropped by default - nunique_result = psser.nunique() + nunique_result = kser.nunique() self.assertEqual(nunique_result, 2) self.assert_eq(nunique_result, pser.nunique()) # Assert including NaN values - nunique_result = psser.nunique(dropna=False) + nunique_result = kser.nunique(dropna=False) self.assertEqual(nunique_result, 3) self.assert_eq(nunique_result, pser.nunique(dropna=False)) @@ -627,105 
+627,105 @@ def test_value_counts(self): index=[1, 2, 1, 3, 3, np.nan, 1, 4, 2, np.nan, 3, np.nan, 3, 1, 3], name="x", ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) exp = pser.value_counts() - res = psser.value_counts() + res = kser.value_counts() self.assertEqual(res.name, exp.name) self.assert_eq(res, exp) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - psser.value_counts(normalize=True, dropna=False), + kser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.value_counts(ascending=True, dropna=False), + kser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) self.assert_eq( - psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) with self.assertRaisesRegex( NotImplementedError, "value_counts currently does not support bins" ): - psser.value_counts(bins=3) + kser.value_counts(bins=3) pser.name = "index" - psser.name = "index" - self.assert_eq(psser.value_counts(), pser.value_counts()) + kser.name = "index" + self.assert_eq(kser.value_counts(), pser.value_counts()) # Series from DataFrame pdf = pd.DataFrame({"a": [2, 2, 3], "b": [None, 1, None]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq(psdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True)) - self.assert_eq(psdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True)) + self.assert_eq(kdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True)) + self.assert_eq(kdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True)) self.assert_eq( - psdf.a.value_counts(normalize=True, dropna=False), + kdf.a.value_counts(normalize=True, dropna=False), pdf.a.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psdf.a.value_counts(ascending=True, dropna=False), + kdf.a.value_counts(ascending=True, dropna=False), pdf.a.value_counts(ascending=True, dropna=False), ) self.assert_eq( - psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - 
psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) # Series with NaN index pser = pd.Series([3, 2, 3, 1, 2, 3], index=[2.0, None, 5.0, 5.0, None, 5.0]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - psser.value_counts(normalize=True, dropna=False), + kser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.value_counts(ascending=True, dropna=False), + kser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) self.assert_eq( - psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True) ) self.assert_eq( - psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True) ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), ) @@ -733,37 +733,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - psser.value_counts(normalize=True, dropna=False), + kser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.value_counts(ascending=True, dropna=False), + kser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. 
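# [Editorial sketch, not part of the patch: why the assertions below pass
#  `almost=True`. Per the FIXME above, MultiIndex.value_counts can come back with
#  indices that do not match pandas exactly, so only an approximate comparison is
#  safe. Assumes pyspark.pandas as `ps`, as in this module.]
import pandas as pd
import pyspark.pandas as ps

pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c"), ("x", "a")])
pser = pd.Series([1, 2, 3, 4], index=pmidx)
kser = ps.from_pandas(pser)

# Counts agree, but the attached MultiIndex may differ from pandas, hence almost=True.
print(pser.index.value_counts())
print(kser.index.value_counts().to_pandas())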
self.assert_eq( - psser.index.value_counts(normalize=True), + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True), + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -772,37 +772,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", None), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - psser.value_counts(normalize=True, dropna=False), + kser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.value_counts(ascending=True, dropna=False), + kser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. self.assert_eq( - psser.index.value_counts(normalize=True), + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True), + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -813,37 +813,37 @@ def test_value_counts(self): pser.index = pd.MultiIndex.from_tuples( [("x", "a"), None, ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq(kser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(kser.value_counts(ascending=True), pser.value_counts(ascending=True)) self.assert_eq( - psser.value_counts(normalize=True, dropna=False), + kser.value_counts(normalize=True, dropna=False), pser.value_counts(normalize=True, dropna=False), ) self.assert_eq( - psser.value_counts(ascending=True, dropna=False), + kser.value_counts(ascending=True, dropna=False), pser.value_counts(ascending=True, dropna=False), ) # FIXME: MultiIndex.value_counts returns wrong indices. 
self.assert_eq( - psser.index.value_counts(normalize=True), + kser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True), + kser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True), almost=True, ) self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), + kser.index.value_counts(normalize=True, dropna=False), pser.index.value_counts(normalize=True, dropna=False), almost=True, ) self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), + kser.index.value_counts(ascending=True, dropna=False), pser.index.value_counts(ascending=True, dropna=False), almost=True, ) @@ -851,31 +851,31 @@ def test_value_counts(self): def test_nsmallest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] pser = pd.Series(sample_lst, name="x") - psser = ps.Series(sample_lst, name="x") - self.assert_eq(psser.nsmallest(n=3), pser.nsmallest(n=3)) - self.assert_eq(psser.nsmallest(), pser.nsmallest()) - self.assert_eq((psser + 1).nsmallest(), (pser + 1).nsmallest()) + kser = ps.Series(sample_lst, name="x") + self.assert_eq(kser.nsmallest(n=3), pser.nsmallest(n=3)) + self.assert_eq(kser.nsmallest(), pser.nsmallest()) + self.assert_eq((kser + 1).nsmallest(), (pser + 1).nsmallest()) def test_nlargest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] pser = pd.Series(sample_lst, name="x") - psser = ps.Series(sample_lst, name="x") - self.assert_eq(psser.nlargest(n=3), pser.nlargest(n=3)) - self.assert_eq(psser.nlargest(), pser.nlargest()) - self.assert_eq((psser + 1).nlargest(), (pser + 1).nlargest()) + kser = ps.Series(sample_lst, name="x") + self.assert_eq(kser.nlargest(n=3), pser.nlargest(n=3)) + self.assert_eq(kser.nlargest(), pser.nlargest()) + self.assert_eq((kser + 1).nlargest(), (pser + 1).nlargest()) def test_isnull(self): pser = pd.Series([1, 2, 3, 4, np.nan, 6], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.notnull(), pser.notnull()) - self.assert_eq(psser.isnull(), pser.isnull()) + self.assert_eq(kser.notnull(), pser.notnull()) + self.assert_eq(kser.isnull(), pser.isnull()) pser = self.pser - psser = self.psser + kser = self.kser - self.assert_eq(psser.notnull(), pser.notnull()) - self.assert_eq(psser.isnull(), pser.isnull()) + self.assert_eq(kser.notnull(), pser.notnull()) + self.assert_eq(kser.isnull(), pser.isnull()) def test_all(self): for pser in [ @@ -888,18 +888,18 @@ def test_all(self): pd.Series([], name="x"), pd.Series([np.nan], name="x"), ]: - psser = ps.from_pandas(pser) - self.assert_eq(psser.all(), pser.all()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.all(), pser.all()) pser = pd.Series([1, 2, 3, 4], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq((psser % 2 == 0).all(), (pser % 2 == 0).all()) + self.assert_eq((kser % 2 == 0).all(), (pser % 2 == 0).all()) with self.assertRaisesRegex( NotImplementedError, 'axis should be either 0 or "index" currently.' 
): - psser.all(axis=1) + kser.all(axis=1) def test_any(self): for pser in [ @@ -912,153 +912,151 @@ def test_any(self): pd.Series([], name="x"), pd.Series([np.nan], name="x"), ]: - psser = ps.from_pandas(pser) - self.assert_eq(psser.any(), pser.any()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.any(), pser.any()) pser = pd.Series([1, 2, 3, 4], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq((psser % 2 == 0).any(), (pser % 2 == 0).any()) + self.assert_eq((kser % 2 == 0).any(), (pser % 2 == 0).any()) with self.assertRaisesRegex( NotImplementedError, 'axis should be either 0 or "index" currently.' ): - psser.any(axis=1) + kser.any(axis=1) def test_reset_index(self): pdf = pd.DataFrame({"foo": [1, 2, 3, 4]}, index=pd.Index(["a", "b", "c", "d"], name="idx")) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.foo - psser = psdf.foo + kser = kdf.foo - self.assert_eq(psser.reset_index(), pser.reset_index()) - self.assert_eq(psser.reset_index(name="values"), pser.reset_index(name="values")) - self.assert_eq(psser.reset_index(drop=True), pser.reset_index(drop=True)) + self.assert_eq(kser.reset_index(), pser.reset_index()) + self.assert_eq(kser.reset_index(name="values"), pser.reset_index(name="values")) + self.assert_eq(kser.reset_index(drop=True), pser.reset_index(drop=True)) # inplace - psser.reset_index(drop=True, inplace=True) + kser.reset_index(drop=True, inplace=True) pser.reset_index(drop=True, inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) def test_reset_index_with_default_index_types(self): pser = pd.Series([1, 2, 3], name="0", index=np.random.rand(3)) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) with ps.option_context("compute.default_index_type", "sequence"): - self.assert_eq(psser.reset_index(), pser.reset_index()) + self.assert_eq(kser.reset_index(), pser.reset_index()) with ps.option_context("compute.default_index_type", "distributed-sequence"): # the order might be changed. - self.assert_eq(psser.reset_index().sort_index(), pser.reset_index()) + self.assert_eq(kser.reset_index().sort_index(), pser.reset_index()) with ps.option_context("compute.default_index_type", "distributed"): # the index is different. 
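# [Editorial sketch, not part of the patch: what the three default index types
#  exercised in test_reset_index_with_default_index_types mean when reset_index()
#  must attach a fresh index. Uses only the option_context API already used above.]
import pandas as pd
import pyspark.pandas as ps

kser = ps.from_pandas(pd.Series([1, 2, 3], name="0", index=[0.1, 0.2, 0.3]))

with ps.option_context("compute.default_index_type", "sequence"):
    kser.reset_index()               # consecutive 0..n-1, globally ordered (exact match)

with ps.option_context("compute.default_index_type", "distributed-sequence"):
    kser.reset_index().sort_index()  # still 0..n-1, but row order may change

with ps.option_context("compute.default_index_type", "distributed"):
    kser.reset_index()               # monotonically increasing, non-consecutive ids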
self.assert_eq( - psser.reset_index().to_pandas().reset_index(drop=True), pser.reset_index() + kser.reset_index().to_pandas().reset_index(drop=True), pser.reset_index() ) def test_index_to_series_reset_index(self): - def check(psser, pser): - self.assert_eq(psser.reset_index(), pser.reset_index()) - self.assert_eq(psser.reset_index(drop=True), pser.reset_index(drop=True)) + def check(kser, pser): + self.assert_eq(kser.reset_index(), pser.reset_index()) + self.assert_eq(kser.reset_index(drop=True), pser.reset_index(drop=True)) pser.reset_index(drop=True, inplace=True) - psser.reset_index(drop=True, inplace=True) - self.assert_eq(psser, pser) + kser.reset_index(drop=True, inplace=True) + self.assert_eq(kser, pser) pdf = pd.DataFrame( {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, index=np.random.rand(9), ) - psdf = ps.from_pandas(pdf) - check(psdf.index.to_series(), pdf.index.to_series()) - check(psdf.index.to_series(name="a"), pdf.index.to_series(name="a")) - check(psdf.index.to_series(name=("x", "a")), pdf.index.to_series(name=("x", "a"))) + kdf = ps.from_pandas(pdf) + check(kdf.index.to_series(), pdf.index.to_series()) + check(kdf.index.to_series(name="a"), pdf.index.to_series(name="a")) + check(kdf.index.to_series(name=("x", "a")), pdf.index.to_series(name=("x", "a"))) def test_sort_values(self): pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.sort_values(), pser.sort_values()) - self.assert_eq(psser.sort_values(ascending=False), pser.sort_values(ascending=False)) - self.assert_eq( - psser.sort_values(na_position="first"), pser.sort_values(na_position="first") - ) + self.assert_eq(kser.sort_values(), pser.sort_values()) + self.assert_eq(kser.sort_values(ascending=False), pser.sort_values(ascending=False)) + self.assert_eq(kser.sort_values(na_position="first"), pser.sort_values(na_position="first")) - self.assertRaises(ValueError, lambda: psser.sort_values(na_position="invalid")) + self.assertRaises(ValueError, lambda: kser.sort_values(na_position="invalid")) # inplace # pandas raises an exception when the Series is derived from DataFrame - psser.sort_values(inplace=True) - self.assert_eq(psser, pser.sort_values()) - self.assert_eq(psdf, pdf) + kser.sort_values(inplace=True) + self.assert_eq(kser, pser.sort_values()) + self.assert_eq(kdf, pdf) pser = pdf.x.copy() - psser = psdf.x.copy() + kser = kdf.x.copy() - psser.sort_values(inplace=True) + kser.sort_values(inplace=True) pser.sort_values(inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) def test_sort_index(self): pdf = pd.DataFrame({"x": [2, 1, np.nan]}, index=["b", "a", np.nan]) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x # Assert invalid parameters - self.assertRaises(NotImplementedError, lambda: psser.sort_index(axis=1)) - self.assertRaises(NotImplementedError, lambda: psser.sort_index(kind="mergesort")) - self.assertRaises(ValueError, lambda: psser.sort_index(na_position="invalid")) + self.assertRaises(NotImplementedError, lambda: kser.sort_index(axis=1)) + self.assertRaises(NotImplementedError, lambda: kser.sort_index(kind="mergesort")) + self.assertRaises(ValueError, lambda: kser.sort_index(na_position="invalid")) # Assert default behavior without parameters - self.assert_eq(psser.sort_index(), pser.sort_index()) + 
self.assert_eq(kser.sort_index(), pser.sort_index()) # Assert sorting descending - self.assert_eq(psser.sort_index(ascending=False), pser.sort_index(ascending=False)) + self.assert_eq(kser.sort_index(ascending=False), pser.sort_index(ascending=False)) # Assert sorting NA indices first - self.assert_eq(psser.sort_index(na_position="first"), pser.sort_index(na_position="first")) + self.assert_eq(kser.sort_index(na_position="first"), pser.sort_index(na_position="first")) # Assert sorting inplace # pandas sorts pdf.x by the index and update the column only # when the Series is derived from DataFrame. - psser.sort_index(inplace=True) - self.assert_eq(psser, pser.sort_index()) - self.assert_eq(psdf, pdf) + kser.sort_index(inplace=True) + self.assert_eq(kser, pser.sort_index()) + self.assert_eq(kdf, pdf) pser = pdf.x.copy() - psser = psdf.x.copy() + kser = kdf.x.copy() - psser.sort_index(inplace=True) + kser.sort_index(inplace=True) pser.sort_index(inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) # Assert multi-indices pser = pd.Series(range(4), index=[["b", "b", "a", "a"], [1, 0, 1, 0]], name="0") - psser = ps.from_pandas(pser) - self.assert_eq(psser.sort_index(), pser.sort_index()) - self.assert_eq(psser.sort_index(level=[1, 0]), pser.sort_index(level=[1, 0])) + kser = ps.from_pandas(pser) + self.assert_eq(kser.sort_index(), pser.sort_index()) + self.assert_eq(kser.sort_index(level=[1, 0]), pser.sort_index(level=[1, 0])) - self.assert_eq(psser.reset_index().sort_index(), pser.reset_index().sort_index()) + self.assert_eq(kser.reset_index().sort_index(), pser.reset_index().sort_index()) def test_to_datetime(self): pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( pd.to_datetime(pser, infer_datetime_format=True), - ps.to_datetime(psser, infer_datetime_format=True), + ps.to_datetime(kser, infer_datetime_format=True), ) def test_missing(self): - psser = self.psser + kser = self.kser missing_functions = inspect.getmembers(MissingPandasLikeSeries, inspect.isfunction) unsupported_functions = [ @@ -1069,7 +1067,7 @@ def test_missing(self): PandasNotImplementedError, "method.*Series.*{}.*not implemented( yet\\.|\\. .+)".format(name), ): - getattr(psser, name)() + getattr(kser, name)() deprecated_functions = [ name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" @@ -1078,7 +1076,7 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "method.*Series.*{}.*is deprecated".format(name) ): - getattr(psser, name)() + getattr(kser, name)() missing_properties = inspect.getmembers( MissingPandasLikeSeries, lambda o: isinstance(o, property) @@ -1093,7 +1091,7 @@ def test_missing(self): PandasNotImplementedError, "property.*Series.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), ): - getattr(psser, name) + getattr(kser, name) deprecated_properties = [ name for (name, type_) in missing_properties @@ -1103,271 +1101,267 @@ def test_missing(self): with self.assertRaisesRegex( PandasNotImplementedError, "property.*Series.*{}.*is deprecated".format(name) ): - getattr(psser, name) + getattr(kser, name) def test_clip(self): pser = pd.Series([0, 2, 4], index=np.random.rand(3)) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) # Assert list-like values are not accepted for 'lower' and 'upper' msg = "List-like value are not supported for 'lower' and 'upper' at the moment" - with self.assertRaises(TypeError, msg=msg): - psser.clip(lower=[1]) - with self.assertRaises(TypeError, msg=msg): - psser.clip(upper=[1]) + with self.assertRaises(ValueError, msg=msg): + kser.clip(lower=[1]) + with self.assertRaises(ValueError, msg=msg): + kser.clip(upper=[1]) # Assert no lower or upper - self.assert_eq(psser.clip(), pser.clip()) + self.assert_eq(kser.clip(), pser.clip()) # Assert lower only - self.assert_eq(psser.clip(1), pser.clip(1)) + self.assert_eq(kser.clip(1), pser.clip(1)) # Assert upper only - self.assert_eq(psser.clip(upper=3), pser.clip(upper=3)) + self.assert_eq(kser.clip(upper=3), pser.clip(upper=3)) # Assert lower and upper - self.assert_eq(psser.clip(1, 3), pser.clip(1, 3)) + self.assert_eq(kser.clip(1, 3), pser.clip(1, 3)) # Assert behavior on string values - str_psser = ps.Series(["a", "b", "c"]) - self.assert_eq(str_psser.clip(1, 3), str_psser) + str_kser = ps.Series(["a", "b", "c"]) + self.assert_eq(str_kser.clip(1, 3), str_kser) def test_compare(self): if LooseVersion(pd.__version__) >= LooseVersion("1.1"): pser = pd.Series([1, 2]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - res_psdf = psser.compare(psser) - self.assertTrue(res_psdf.empty) - self.assert_eq(res_psdf.columns, pd.Index(["self", "other"])) + res_kdf = kser.compare(kser) + self.assertTrue(res_kdf.empty) + self.assert_eq(res_kdf.columns, pd.Index(["self", "other"])) - self.assert_eq( - pser.compare(pser + 1).sort_index(), psser.compare(psser + 1).sort_index() - ) + self.assert_eq(pser.compare(pser + 1).sort_index(), kser.compare(kser + 1).sort_index()) pser = pd.Series([1, 2], index=["x", "y"]) - psser = ps.from_pandas(pser) - self.assert_eq( - pser.compare(pser + 1).sort_index(), psser.compare(psser + 1).sort_index() - ) + kser = ps.from_pandas(pser) + self.assert_eq(pser.compare(pser + 1).sort_index(), kser.compare(kser + 1).sort_index()) else: - psser = ps.Series([1, 2]) - res_psdf = psser.compare(psser) - self.assertTrue(res_psdf.empty) - self.assert_eq(res_psdf.columns, pd.Index(["self", "other"])) + kser = ps.Series([1, 2]) + res_kdf = kser.compare(kser) + self.assertTrue(res_kdf.empty) + self.assert_eq(res_kdf.columns, pd.Index(["self", "other"])) expected = ps.DataFrame([[1, 2], [2, 3]], columns=["self", "other"]) - self.assert_eq(expected, psser.compare(psser + 1).sort_index()) + self.assert_eq(expected, kser.compare(kser + 1).sort_index()) - psser = ps.Series([1, 2], index=["x", "y"]) + kser = ps.Series([1, 2], index=["x", "y"]) expected = ps.DataFrame([[1, 2], [2, 3]], index=["x", "y"], columns=["self", "other"]) - self.assert_eq(expected, psser.compare(psser + 1).sort_index()) + self.assert_eq(expected, kser.compare(kser + 1).sort_index()) def test_is_unique(self): # We can't use pandas' is_unique for comparison. 
pandas 0.23 ignores None pser = pd.Series([1, 2, 2, None, None]) - psser = ps.from_pandas(pser) - self.assertEqual(False, psser.is_unique) - self.assertEqual(False, (psser + 1).is_unique) + kser = ps.from_pandas(pser) + self.assertEqual(False, kser.is_unique) + self.assertEqual(False, (kser + 1).is_unique) pser = pd.Series([1, None, None]) - psser = ps.from_pandas(pser) - self.assertEqual(False, psser.is_unique) - self.assertEqual(False, (psser + 1).is_unique) + kser = ps.from_pandas(pser) + self.assertEqual(False, kser.is_unique) + self.assertEqual(False, (kser + 1).is_unique) pser = pd.Series([1]) - psser = ps.from_pandas(pser) - self.assertEqual(pser.is_unique, psser.is_unique) - self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique) + kser = ps.from_pandas(pser) + self.assertEqual(pser.is_unique, kser.is_unique) + self.assertEqual((pser + 1).is_unique, (kser + 1).is_unique) pser = pd.Series([1, 1, 1]) - psser = ps.from_pandas(pser) - self.assertEqual(pser.is_unique, psser.is_unique) - self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique) + kser = ps.from_pandas(pser) + self.assertEqual(pser.is_unique, kser.is_unique) + self.assertEqual((pser + 1).is_unique, (kser + 1).is_unique) def test_to_list(self): - self.assert_eq(self.psser.tolist(), self.pser.tolist()) + self.assert_eq(self.kser.tolist(), self.pser.tolist()) def test_append(self): pser1 = pd.Series([1, 2, 3], name="0") pser2 = pd.Series([4, 5, 6], name="0") pser3 = pd.Series([4, 5, 6], index=[3, 4, 5], name="0") - psser1 = ps.from_pandas(pser1) - psser2 = ps.from_pandas(pser2) - psser3 = ps.from_pandas(pser3) + kser1 = ps.from_pandas(pser1) + kser2 = ps.from_pandas(pser2) + kser3 = ps.from_pandas(pser3) - self.assert_eq(psser1.append(psser2), pser1.append(pser2)) - self.assert_eq(psser1.append(psser3), pser1.append(pser3)) + self.assert_eq(kser1.append(kser2), pser1.append(pser2)) + self.assert_eq(kser1.append(kser3), pser1.append(pser3)) self.assert_eq( - psser1.append(psser2, ignore_index=True), pser1.append(pser2, ignore_index=True) + kser1.append(kser2, ignore_index=True), pser1.append(pser2, ignore_index=True) ) - psser1.append(psser3, verify_integrity=True) + kser1.append(kser3, verify_integrity=True) msg = "Indices have overlapping values" with self.assertRaises(ValueError, msg=msg): - psser1.append(psser2, verify_integrity=True) + kser1.append(kser2, verify_integrity=True) def test_map(self): pser = pd.Series(["cat", "dog", None, "rabbit"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) # Currently Koalas doesn't return NaN as pandas does. 
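# [Editorial sketch, not part of the patch: the discrepancy noted in the comment
#  above. For keys missing from the mapping, pandas map() yields NaN while the
#  Koalas-era implementation yields None, which is why the test normalizes the
#  pandas side with .replace({nan: None}) before comparing. Hedged illustration,
#  assuming pyspark.pandas as `ps`.]
import numpy as np
import pandas as pd
import pyspark.pandas as ps

pser = pd.Series(["cat", "dog", None, "rabbit"])
kser = ps.from_pandas(pser)

expected = pser.map({}).replace({np.nan: None})  # pandas: NaN normalized to None
actual = kser.map({}).to_pandas()                # pandas-on-Spark already yields None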
- self.assert_eq(psser.map({}), pser.map({}).replace({pd.np.nan: None})) + self.assert_eq(kser.map({}), pser.map({}).replace({pd.np.nan: None})) d = defaultdict(lambda: "abc") - self.assertTrue("abc" in repr(psser.map(d))) - self.assert_eq(psser.map(d), pser.map(d)) + self.assertTrue("abc" in repr(kser.map(d))) + self.assert_eq(kser.map(d), pser.map(d)) def tomorrow(date) -> datetime: return date + timedelta(days=1) pser = pd.Series([datetime(2019, 10, 24)]) - psser = ps.from_pandas(pser) - self.assert_eq(psser.map(tomorrow), pser.map(tomorrow)) + kser = ps.from_pandas(pser) + self.assert_eq(kser.map(tomorrow), pser.map(tomorrow)) def test_add_prefix(self): pser = pd.Series([1, 2, 3, 4], name="0") - psser = ps.from_pandas(pser) - self.assert_eq(pser.add_prefix("item_"), psser.add_prefix("item_")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_")) pser = pd.Series( [1, 2, 3], name="0", index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]), ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.add_prefix("item_"), psser.add_prefix("item_")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.add_prefix("item_"), kser.add_prefix("item_")) def test_add_suffix(self): pser = pd.Series([1, 2, 3, 4], name="0") - psser = ps.from_pandas(pser) - self.assert_eq(pser.add_suffix("_item"), psser.add_suffix("_item")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item")) pser = pd.Series( [1, 2, 3], name="0", index=pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("B", "X")]), ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.add_suffix("_item"), psser.add_suffix("_item")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.add_suffix("_item"), kser.add_suffix("_item")) def test_cummin(self): pser = pd.Series([1.0, None, 0.0, 4.0, 9.0]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cummin(), psser.cummin()) - self.assert_eq(pser.cummin(skipna=False), psser.cummin(skipna=False)) - self.assert_eq(pser.cummin().sum(), psser.cummin().sum()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cummin(), kser.cummin()) + self.assert_eq(pser.cummin(skipna=False), kser.cummin(skipna=False)) + self.assert_eq(pser.cummin().sum(), kser.cummin().sum()) # with reversed index pser.index = [4, 3, 2, 1, 0] - psser = ps.from_pandas(pser) - self.assert_eq(pser.cummin(), psser.cummin()) - self.assert_eq(pser.cummin(skipna=False), psser.cummin(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cummin(), kser.cummin()) + self.assert_eq(pser.cummin(skipna=False), kser.cummin(skipna=False)) def test_cummax(self): pser = pd.Series([1.0, None, 0.0, 4.0, 9.0]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cummax(), psser.cummax()) - self.assert_eq(pser.cummax(skipna=False), psser.cummax(skipna=False)) - self.assert_eq(pser.cummax().sum(), psser.cummax().sum()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cummax(), kser.cummax()) + self.assert_eq(pser.cummax(skipna=False), kser.cummax(skipna=False)) + self.assert_eq(pser.cummax().sum(), kser.cummax().sum()) # with reversed index pser.index = [4, 3, 2, 1, 0] - psser = ps.from_pandas(pser) - self.assert_eq(pser.cummax(), psser.cummax()) - self.assert_eq(pser.cummax(skipna=False), psser.cummax(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cummax(), kser.cummax()) + self.assert_eq(pser.cummax(skipna=False), kser.cummax(skipna=False)) def test_cumsum(self): pser = pd.Series([1.0, None, 0.0, 4.0, 9.0]) - psser = 
ps.from_pandas(pser) - self.assert_eq(pser.cumsum(), psser.cumsum()) - self.assert_eq(pser.cumsum(skipna=False), psser.cumsum(skipna=False)) - self.assert_eq(pser.cumsum().sum(), psser.cumsum().sum()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumsum(), kser.cumsum()) + self.assert_eq(pser.cumsum(skipna=False), kser.cumsum(skipna=False)) + self.assert_eq(pser.cumsum().sum(), kser.cumsum().sum()) # with reversed index pser.index = [4, 3, 2, 1, 0] - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumsum(), psser.cumsum()) - self.assert_eq(pser.cumsum(skipna=False), psser.cumsum(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumsum(), kser.cumsum()) + self.assert_eq(pser.cumsum(skipna=False), kser.cumsum(skipna=False)) # bool pser = pd.Series([True, True, False, True]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumsum().astype(int), psser.cumsum()) - self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumsum().astype(int), kser.cumsum()) + self.assert_eq(pser.cumsum(skipna=False).astype(int), kser.cumsum(skipna=False)) def test_cumprod(self): pser = pd.Series([1.0, None, 1.0, 4.0, 9.0]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False)) - self.assert_eq(pser.cumprod().sum(), psser.cumprod().sum()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False)) + self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum()) # with integer type pser = pd.Series([1, 10, 1, 4, 9]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False)) - self.assert_eq(pser.cumprod().sum(), psser.cumprod().sum()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False)) + self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum()) # with reversed index pser.index = [4, 3, 2, 1, 0] - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False)) # including zero pser = pd.Series([1, 2, 0, 3]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False)) # including negative values pser = pd.Series([1, -1, -2]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False), psser.cumprod(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False)) # bool pser = pd.Series([True, True, False, True]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.cumprod(), psser.cumprod()) - self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.cumprod(), kser.cumprod()) + 
self.assert_eq(pser.cumprod(skipna=False).astype(int), kser.cumprod(skipna=False)) def test_median(self): - with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") def test_rank(self): pser = pd.Series([1, 2, 3, 1], name="x") - psser = ps.from_pandas(pser) - self.assert_eq(pser.rank(), psser.rank().sort_index()) - self.assert_eq(pser.rank().sum(), psser.rank().sum()) - self.assert_eq(pser.rank(ascending=False), psser.rank(ascending=False).sort_index()) - self.assert_eq(pser.rank(method="min"), psser.rank(method="min").sort_index()) - self.assert_eq(pser.rank(method="max"), psser.rank(method="max").sort_index()) - self.assert_eq(pser.rank(method="first"), psser.rank(method="first").sort_index()) - self.assert_eq(pser.rank(method="dense"), psser.rank(method="dense").sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.rank(), kser.rank().sort_index()) + self.assert_eq(pser.rank().sum(), kser.rank().sum()) + self.assert_eq(pser.rank(ascending=False), kser.rank(ascending=False).sort_index()) + self.assert_eq(pser.rank(method="min"), kser.rank(method="min").sort_index()) + self.assert_eq(pser.rank(method="max"), kser.rank(method="max").sort_index()) + self.assert_eq(pser.rank(method="first"), kser.rank(method="first").sort_index()) + self.assert_eq(pser.rank(method="dense"), kser.rank(method="dense").sort_index()) msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'" with self.assertRaisesRegex(ValueError, msg): - psser.rank(method="nothing") + kser.rank(method="nothing") def test_round(self): pser = pd.Series([0.028208, 0.038683, 0.877076], name="x") - psser = ps.from_pandas(pser) - self.assert_eq(pser.round(2), psser.round(2)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.round(2), kser.round(2)) msg = "decimals must be an integer" - with self.assertRaisesRegex(TypeError, msg): - psser.round(1.5) + with self.assertRaisesRegex(ValueError, msg): + kser.round(1.5) def test_quantile(self): pser = pd.Series([]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.quantile(0.5), pser.quantile(0.5)) - self.assert_eq(psser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75])) + self.assert_eq(kser.quantile(0.5), pser.quantile(0.5)) + self.assert_eq(kser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75])) - with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a") - with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") - with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): @@ -1377,151 +1371,149 @@ def test_quantile(self): def test_idxmax(self): pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"]) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmax(), pser.idxmax()) - self.assertEqual(psser.idxmax(skipna=False), 
pser.idxmax(skipna=False)) + self.assertEqual(kser.idxmax(), pser.idxmax()) + self.assertEqual(kser.idxmax(skipna=False), pser.idxmax(skipna=False)) index = pd.MultiIndex.from_arrays( [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second") ) pser = pd.Series(data=[1, 2, 4, 5], index=index) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmax(), pser.idxmax()) - self.assertEqual(psser.idxmax(skipna=False), pser.idxmax(skipna=False)) + self.assertEqual(kser.idxmax(), pser.idxmax()) + self.assertEqual(kser.idxmax(skipna=False), pser.idxmax(skipna=False)) - psser = ps.Series([]) + kser = ps.Series([]) with self.assertRaisesRegex(ValueError, "an empty sequence"): - psser.idxmax() + kser.idxmax() pser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8]) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmax(), pser.idxmax()) - self.assertEqual(repr(psser.idxmax(skipna=False)), repr(pser.idxmax(skipna=False))) + self.assertEqual(kser.idxmax(), pser.idxmax()) + self.assertEqual(repr(kser.idxmax(skipna=False)), repr(pser.idxmax(skipna=False))) def test_idxmin(self): pser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"]) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmin(), pser.idxmin()) - self.assertEqual(psser.idxmin(skipna=False), pser.idxmin(skipna=False)) + self.assertEqual(kser.idxmin(), pser.idxmin()) + self.assertEqual(kser.idxmin(skipna=False), pser.idxmin(skipna=False)) index = pd.MultiIndex.from_arrays( [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second") ) pser = pd.Series(data=[1, 2, 4, 5], index=index) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmin(), pser.idxmin()) - self.assertEqual(psser.idxmin(skipna=False), pser.idxmin(skipna=False)) + self.assertEqual(kser.idxmin(), pser.idxmin()) + self.assertEqual(kser.idxmin(skipna=False), pser.idxmin(skipna=False)) - psser = ps.Series([]) + kser = ps.Series([]) with self.assertRaisesRegex(ValueError, "an empty sequence"): - psser.idxmin() + kser.idxmin() pser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8]) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assertEqual(psser.idxmin(), pser.idxmin()) - self.assertEqual(repr(psser.idxmin(skipna=False)), repr(pser.idxmin(skipna=False))) + self.assertEqual(kser.idxmin(), pser.idxmin()) + self.assertEqual(repr(kser.idxmin(skipna=False)), repr(pser.idxmin(skipna=False))) def test_shift(self): pser = pd.Series([10, 20, 15, 30, 45], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.shift(2), pser.shift(2)) - self.assert_eq(psser.shift().shift(-1), pser.shift().shift(-1)) - self.assert_eq(psser.shift().sum(), pser.shift().sum()) + self.assert_eq(kser.shift(2), pser.shift(2)) + self.assert_eq(kser.shift().shift(-1), pser.shift().shift(-1)) + self.assert_eq(kser.shift().sum(), pser.shift().sum()) if LooseVersion(pd.__version__) < LooseVersion("0.24.2"): - self.assert_eq(psser.shift(periods=2), pser.shift(periods=2)) + self.assert_eq(kser.shift(periods=2), pser.shift(periods=2)) else: - self.assert_eq( - psser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0) - ) - with self.assertRaisesRegex(TypeError, "periods should be an int; however"): - psser.shift(periods=1.5) + self.assert_eq(kser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0)) + with self.assertRaisesRegex(ValueError, "periods should be an int; however"): + 
kser.shift(periods=1.5) def test_diff(self): pser = pd.Series([10, 20, 15, 30, 45], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.diff(2), pser.diff(2)) - self.assert_eq(psser.diff().diff(-1), pser.diff().diff(-1)) - self.assert_eq(psser.diff().sum(), pser.diff().sum()) + self.assert_eq(kser.diff(2), pser.diff(2)) + self.assert_eq(kser.diff().diff(-1), pser.diff().diff(-1)) + self.assert_eq(kser.diff().sum(), pser.diff().sum()) def _test_numeric_astype(self, pser): - psser = ps.Series(pser) - - self.assert_eq(psser.astype(int), pser.astype(int)) - self.assert_eq(psser.astype(np.int), pser.astype(np.int)) - self.assert_eq(psser.astype(np.int8), pser.astype(np.int8)) - self.assert_eq(psser.astype(np.int16), pser.astype(np.int16)) - self.assert_eq(psser.astype(np.int32), pser.astype(np.int32)) - self.assert_eq(psser.astype(np.int64), pser.astype(np.int64)) - self.assert_eq(psser.astype(np.byte), pser.astype(np.byte)) - self.assert_eq(psser.astype("int"), pser.astype("int")) - self.assert_eq(psser.astype("int8"), pser.astype("int8")) - self.assert_eq(psser.astype("int16"), pser.astype("int16")) - self.assert_eq(psser.astype("int32"), pser.astype("int32")) - self.assert_eq(psser.astype("int64"), pser.astype("int64")) - self.assert_eq(psser.astype("b"), pser.astype("b")) - self.assert_eq(psser.astype("byte"), pser.astype("byte")) - self.assert_eq(psser.astype("i"), pser.astype("i")) - self.assert_eq(psser.astype("long"), pser.astype("long")) - self.assert_eq(psser.astype("short"), pser.astype("short")) - self.assert_eq(psser.astype(np.float), pser.astype(np.float)) - self.assert_eq(psser.astype(np.float32), pser.astype(np.float32)) - self.assert_eq(psser.astype(np.float64), pser.astype(np.float64)) - self.assert_eq(psser.astype("float"), pser.astype("float")) - self.assert_eq(psser.astype("float32"), pser.astype("float32")) - self.assert_eq(psser.astype("float64"), pser.astype("float64")) - self.assert_eq(psser.astype("double"), pser.astype("double")) - self.assert_eq(psser.astype("f"), pser.astype("f")) - self.assert_eq(psser.astype(bool), pser.astype(bool)) - self.assert_eq(psser.astype("bool"), pser.astype("bool")) - self.assert_eq(psser.astype("?"), pser.astype("?")) - self.assert_eq(psser.astype(np.unicode_), pser.astype(np.unicode_)) - self.assert_eq(psser.astype("str"), pser.astype("str")) - self.assert_eq(psser.astype("U"), pser.astype("U")) + kser = ps.Series(pser) + + self.assert_eq(kser.astype(int), pser.astype(int)) + self.assert_eq(kser.astype(np.int), pser.astype(np.int)) + self.assert_eq(kser.astype(np.int8), pser.astype(np.int8)) + self.assert_eq(kser.astype(np.int16), pser.astype(np.int16)) + self.assert_eq(kser.astype(np.int32), pser.astype(np.int32)) + self.assert_eq(kser.astype(np.int64), pser.astype(np.int64)) + self.assert_eq(kser.astype(np.byte), pser.astype(np.byte)) + self.assert_eq(kser.astype("int"), pser.astype("int")) + self.assert_eq(kser.astype("int8"), pser.astype("int8")) + self.assert_eq(kser.astype("int16"), pser.astype("int16")) + self.assert_eq(kser.astype("int32"), pser.astype("int32")) + self.assert_eq(kser.astype("int64"), pser.astype("int64")) + self.assert_eq(kser.astype("b"), pser.astype("b")) + self.assert_eq(kser.astype("byte"), pser.astype("byte")) + self.assert_eq(kser.astype("i"), pser.astype("i")) + self.assert_eq(kser.astype("long"), pser.astype("long")) + self.assert_eq(kser.astype("short"), pser.astype("short")) + self.assert_eq(kser.astype(np.float), pser.astype(np.float)) + 
self.assert_eq(kser.astype(np.float32), pser.astype(np.float32)) + self.assert_eq(kser.astype(np.float64), pser.astype(np.float64)) + self.assert_eq(kser.astype("float"), pser.astype("float")) + self.assert_eq(kser.astype("float32"), pser.astype("float32")) + self.assert_eq(kser.astype("float64"), pser.astype("float64")) + self.assert_eq(kser.astype("double"), pser.astype("double")) + self.assert_eq(kser.astype("f"), pser.astype("f")) + self.assert_eq(kser.astype(bool), pser.astype(bool)) + self.assert_eq(kser.astype("bool"), pser.astype("bool")) + self.assert_eq(kser.astype("?"), pser.astype("?")) + self.assert_eq(kser.astype(np.unicode_), pser.astype(np.unicode_)) + self.assert_eq(kser.astype("str"), pser.astype("str")) + self.assert_eq(kser.astype("U"), pser.astype("U")) if extension_dtypes_available: from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype - self._check_extension(psser.astype("Int8"), pser.astype("Int8")) - self._check_extension(psser.astype("Int16"), pser.astype("Int16")) - self._check_extension(psser.astype("Int32"), pser.astype("Int32")) - self._check_extension(psser.astype("Int64"), pser.astype("Int64")) - self._check_extension(psser.astype(Int8Dtype()), pser.astype(Int8Dtype())) - self._check_extension(psser.astype(Int16Dtype()), pser.astype(Int16Dtype())) - self._check_extension(psser.astype(Int32Dtype()), pser.astype(Int32Dtype())) - self._check_extension(psser.astype(Int64Dtype()), pser.astype(Int64Dtype())) + self._check_extension(kser.astype("Int8"), pser.astype("Int8")) + self._check_extension(kser.astype("Int16"), pser.astype("Int16")) + self._check_extension(kser.astype("Int32"), pser.astype("Int32")) + self._check_extension(kser.astype("Int64"), pser.astype("Int64")) + self._check_extension(kser.astype(Int8Dtype()), pser.astype(Int8Dtype())) + self._check_extension(kser.astype(Int16Dtype()), pser.astype(Int16Dtype())) + self._check_extension(kser.astype(Int32Dtype()), pser.astype(Int32Dtype())) + self._check_extension(kser.astype(Int64Dtype()), pser.astype(Int64Dtype())) if extension_object_dtypes_available: from pandas import StringDtype if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self._check_extension(psser.astype("string"), pser.astype("string")) - self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype())) + self._check_extension(kser.astype("string"), pser.astype("string")) + self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype())) else: self._check_extension( - psser.astype("string"), + kser.astype("string"), pd.Series(["10", "20", "15", "30", "45"], name="x", dtype="string"), ) self._check_extension( - psser.astype(StringDtype()), + kser.astype(StringDtype()), pd.Series(["10", "20", "15", "30", "45"], name="x", dtype=StringDtype()), ) if extension_float_dtypes_available: from pandas import Float32Dtype, Float64Dtype - self._check_extension(psser.astype("Float32"), pser.astype("Float32")) - self._check_extension(psser.astype("Float64"), pser.astype("Float64")) - self._check_extension(psser.astype(Float32Dtype()), pser.astype(Float32Dtype())) - self._check_extension(psser.astype(Float64Dtype()), pser.astype(Float64Dtype())) + self._check_extension(kser.astype("Float32"), pser.astype("Float32")) + self._check_extension(kser.astype("Float64"), pser.astype("Float64")) + self._check_extension(kser.astype(Float32Dtype()), pser.astype(Float32Dtype())) + self._check_extension(kser.astype(Float64Dtype()), pser.astype(Float64Dtype())) def test_astype(self): psers = [pd.Series([10, 20, 15, 30, 45], 
name="x")] @@ -1535,101 +1527,100 @@ def test_astype(self): self._test_numeric_astype(pser) pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.astype(bool), pser.astype(bool)) - self.assert_eq(psser.astype(str), pser.astype(str)) + self.assert_eq(kser.astype(bool), pser.astype(bool)) + self.assert_eq(kser.astype(str), pser.astype(str)) pser = pd.Series(["hi", "hi ", " ", " \t", "", None], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.astype(bool), pser.astype(bool)) + self.assert_eq(kser.astype(bool), pser.astype(bool)) if LooseVersion("1.1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.1.4"): # a pandas bug: https://github.com/databricks/koalas/pull/1818#issuecomment-703961980 - self.assert_eq(psser.astype(str).tolist(), ["hi", "hi ", " ", " \t", "", "None"]) + self.assert_eq(kser.astype(str).tolist(), ["hi", "hi ", " ", " \t", "", "None"]) else: - self.assert_eq(psser.astype(str), pser.astype(str)) - self.assert_eq(psser.str.strip().astype(bool), pser.str.strip().astype(bool)) + self.assert_eq(kser.astype(str), pser.astype(str)) + self.assert_eq(kser.str.strip().astype(bool), pser.str.strip().astype(bool)) if extension_object_dtypes_available: from pandas import StringDtype - self._check_extension(psser.astype("string"), pser.astype("string")) - self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype())) + self._check_extension(kser.astype("string"), pser.astype("string")) + self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype())) pser = pd.Series([True, False, None], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.astype(bool), pser.astype(bool)) - self.assert_eq(psser.astype(str), pser.astype(str)) + self.assert_eq(kser.astype(bool), pser.astype(bool)) + # Comment out the below test cause because pandas returns `None` or `nan` randomly + # self.assert_eq(kser.astype(str), pser.astype(str)) if extension_object_dtypes_available: from pandas import BooleanDtype, StringDtype - self._check_extension(psser.astype("boolean"), pser.astype("boolean")) - self._check_extension(psser.astype(BooleanDtype()), pser.astype(BooleanDtype())) + self._check_extension(kser.astype("boolean"), pser.astype("boolean")) + self._check_extension(kser.astype(BooleanDtype()), pser.astype(BooleanDtype())) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self._check_extension(psser.astype("string"), pser.astype("string")) - self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype())) + self._check_extension(kser.astype("string"), pser.astype("string")) + self._check_extension(kser.astype(StringDtype()), pser.astype(StringDtype())) else: self._check_extension( - psser.astype("string"), + kser.astype("string"), pd.Series(["True", "False", None], name="x", dtype="string"), ) self._check_extension( - psser.astype(StringDtype()), + kser.astype(StringDtype()), pd.Series(["True", "False", None], name="x", dtype=StringDtype()), ) pser = pd.Series(["2020-10-27 00:00:01", None], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.astype(np.datetime64), pser.astype(np.datetime64)) - self.assert_eq(psser.astype("datetime64[ns]"), pser.astype("datetime64[ns]")) - self.assert_eq(psser.astype("M"), pser.astype("M")) - self.assert_eq(psser.astype("M").astype(str), pser.astype("M").astype(str)) + self.assert_eq(kser.astype(np.datetime64), pser.astype(np.datetime64)) + 
self.assert_eq(kser.astype("datetime64[ns]"), pser.astype("datetime64[ns]")) + self.assert_eq(kser.astype("M"), pser.astype("M")) + self.assert_eq(kser.astype("M").astype(str), pser.astype("M").astype(str)) # Comment out the below test cause because pandas returns `NaT` or `nan` randomly - # self.assert_eq( - # psser.astype("M").dt.date.astype(str), pser.astype("M").dt.date.astype(str) - # ) + # self.assert_eq(kser.astype("M").dt.date.astype(str), pser.astype("M").dt.date.astype(str)) if extension_object_dtypes_available: from pandas import StringDtype self._check_extension( - psser.astype("M").astype("string"), pser.astype("M").astype("string") + kser.astype("M").astype("string"), pser.astype("M").astype("string") ) self._check_extension( - psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype()) + kser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype()) ) with self.assertRaisesRegex(TypeError, "not understood"): - psser.astype("int63") + kser.astype("int63") def test_aggregate(self): pser = pd.Series([10, 20, 15, 30, 45], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) msg = "func must be a string or list of strings" - with self.assertRaisesRegex(TypeError, msg): - psser.aggregate({"x": ["min", "max"]}) + with self.assertRaisesRegex(ValueError, msg): + kser.aggregate({"x": ["min", "max"]}) msg = ( "If the given function is a list, it " "should only contains function names as strings." ) with self.assertRaisesRegex(ValueError, msg): - psser.aggregate(["min", max]) + kser.aggregate(["min", max]) def test_drop(self): pser = pd.Series([10, 20, 15, 30, 45], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.drop(1), pser.drop(1)) - self.assert_eq(psser.drop([1, 4]), pser.drop([1, 4])) + self.assert_eq(kser.drop(1), pser.drop(1)) + self.assert_eq(kser.drop([1, 4]), pser.drop([1, 4])) msg = "Need to specify at least one of 'labels' or 'index'" with self.assertRaisesRegex(ValueError, msg): - psser.drop() - self.assertRaises(KeyError, lambda: psser.drop((0, 1))) + kser.drop() + self.assertRaises(KeyError, lambda: kser.drop((0, 1))) # For MultiIndex midx = pd.MultiIndex( @@ -1637,20 +1628,20 @@ def test_drop(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.drop("lama"), pser.drop("lama")) - self.assert_eq(psser.drop(labels="weight", level=1), pser.drop(labels="weight", level=1)) - self.assert_eq(psser.drop(("lama", "weight")), pser.drop(("lama", "weight"))) + self.assert_eq(kser.drop("lama"), pser.drop("lama")) + self.assert_eq(kser.drop(labels="weight", level=1), pser.drop(labels="weight", level=1)) + self.assert_eq(kser.drop(("lama", "weight")), pser.drop(("lama", "weight"))) self.assert_eq( - psser.drop([("lama", "speed"), ("falcon", "weight")]), + kser.drop([("lama", "speed"), ("falcon", "weight")]), pser.drop([("lama", "speed"), ("falcon", "weight")]), ) - self.assert_eq(psser.drop({"lama": "speed"}), pser.drop({"lama": "speed"})) + self.assert_eq(kser.drop({"lama": "speed"}), pser.drop({"lama": "speed"})) msg = "'level' should be less than the number of indexes" with self.assertRaisesRegex(ValueError, msg): - psser.drop(labels="weight", level=2) + kser.drop(labels="weight", level=2) msg = ( "If the given index is a list, it " @@ -1658,15 +1649,15 @@ def test_drop(self): "that contain index names" ) with 
self.assertRaisesRegex(ValueError, msg): - psser.drop(["lama", ["cow", "falcon"]]) + kser.drop(["lama", ["cow", "falcon"]]) msg = "Cannot specify both 'labels' and 'index'" with self.assertRaisesRegex(ValueError, msg): - psser.drop("lama", index="cow") + kser.drop("lama", index="cow") msg = r"'Key length \(2\) exceeds index depth \(3\)'" with self.assertRaisesRegex(KeyError, msg): - psser.drop(("lama", "speed", "x")) + kser.drop(("lama", "speed", "x")) def test_pop(self): midx = pd.MultiIndex( @@ -1674,41 +1665,41 @@ def test_pop(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pdf = pd.DataFrame({"x": [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3]}, index=midx) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.pop(("lama", "speed")), pser.pop(("lama", "speed"))) - self.assert_eq(psser, pser) - self.assert_eq(psdf, pdf) + self.assert_eq(kser.pop(("lama", "speed")), pser.pop(("lama", "speed"))) + self.assert_eq(kser, pser) + self.assert_eq(kdf, pdf) msg = r"'Key length \(3\) exceeds index depth \(2\)'" with self.assertRaisesRegex(KeyError, msg): - psser.pop(("lama", "speed", "x")) + kser.pop(("lama", "speed", "x")) def test_replace(self): pser = pd.Series([10, 20, 15, 30, np.nan], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser.replace(), pser.replace()) - self.assert_eq(psser.replace({}), pser.replace({})) + self.assert_eq(kser.replace(), pser.replace()) + self.assert_eq(kser.replace({}), pser.replace({})) - self.assert_eq(psser.replace(np.nan, 45), pser.replace(np.nan, 45)) - self.assert_eq(psser.replace([10, 15], 45), pser.replace([10, 15], 45)) - self.assert_eq(psser.replace((10, 15), 45), pser.replace((10, 15), 45)) - self.assert_eq(psser.replace([10, 15], [45, 50]), pser.replace([10, 15], [45, 50])) - self.assert_eq(psser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50))) + self.assert_eq(kser.replace(np.nan, 45), pser.replace(np.nan, 45)) + self.assert_eq(kser.replace([10, 15], 45), pser.replace([10, 15], 45)) + self.assert_eq(kser.replace((10, 15), 45), pser.replace((10, 15), 45)) + self.assert_eq(kser.replace([10, 15], [45, 50]), pser.replace([10, 15], [45, 50])) + self.assert_eq(kser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50))) msg = "'to_replace' should be one of str, list, tuple, dict, int, float" - with self.assertRaisesRegex(TypeError, msg): - psser.replace(ps.range(5)) + with self.assertRaisesRegex(ValueError, msg): + kser.replace(ps.range(5)) msg = "Replacement lists must match in length. 
Expecting 3 got 2" with self.assertRaisesRegex(ValueError, msg): - psser.replace([10, 20, 30], [1, 2]) + kser.replace([10, 20, 30], [1, 2]) msg = "replace currently not support for regex" with self.assertRaisesRegex(NotImplementedError, msg): - psser.replace(r"^1.$", regex=True) + kser.replace(r"^1.$", regex=True) def test_xs(self): midx = pd.MultiIndex( @@ -1716,9 +1707,9 @@ def test_xs(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.xs(("a", "lama", "speed")), pser.xs(("a", "lama", "speed"))) + self.assert_eq(kser.xs(("a", "lama", "speed")), pser.xs(("a", "lama", "speed"))) def test_duplicates(self): psers = { @@ -1731,70 +1722,70 @@ def test_duplicates(self): for (msg, pser), keep in product(psers.items(), keeps): with self.subTest(msg, keep=keep): - psser = ps.Series(pser) + kser = ps.Series(pser) self.assert_eq( pser.drop_duplicates(keep=keep).sort_values(), - psser.drop_duplicates(keep=keep).sort_values(), + kser.drop_duplicates(keep=keep).sort_values(), ) def test_update(self): pser = pd.Series([10, 20, 15, 30, 45], name="x") - psser = ps.Series(pser) + kser = ps.Series(pser) msg = "'other' must be a Series" - with self.assertRaisesRegex(TypeError, msg): - psser.update(10) + with self.assertRaisesRegex(ValueError, msg): + kser.update(10) def test_where(self): pser1 = pd.Series([0, 1, 2, 3, 4]) - psser1 = ps.from_pandas(pser1) + kser1 = ps.from_pandas(pser1) - self.assert_eq(pser1.where(pser1 > 3), psser1.where(psser1 > 3).sort_index()) + self.assert_eq(pser1.where(pser1 > 3), kser1.where(kser1 > 3).sort_index()) def test_mask(self): pser1 = pd.Series([0, 1, 2, 3, 4]) - psser1 = ps.from_pandas(pser1) + kser1 = ps.from_pandas(pser1) - self.assert_eq(pser1.mask(pser1 > 3), psser1.mask(psser1 > 3).sort_index()) + self.assert_eq(pser1.mask(pser1 > 3), kser1.mask(kser1 > 3).sort_index()) def test_truncate(self): pser1 = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7]) - psser1 = ps.Series(pser1) + kser1 = ps.Series(pser1) pser2 = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[7, 6, 5, 4, 3, 2, 1]) - psser2 = ps.Series(pser2) + kser2 = ps.Series(pser2) - self.assert_eq(psser1.truncate(), pser1.truncate()) - self.assert_eq(psser1.truncate(before=2), pser1.truncate(before=2)) - self.assert_eq(psser1.truncate(after=5), pser1.truncate(after=5)) - self.assert_eq(psser1.truncate(copy=False), pser1.truncate(copy=False)) - self.assert_eq(psser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False)) + self.assert_eq(kser1.truncate(), pser1.truncate()) + self.assert_eq(kser1.truncate(before=2), pser1.truncate(before=2)) + self.assert_eq(kser1.truncate(after=5), pser1.truncate(after=5)) + self.assert_eq(kser1.truncate(copy=False), pser1.truncate(copy=False)) + self.assert_eq(kser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False)) # The bug for these tests has been fixed in pandas 1.1.0. 
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - self.assert_eq(psser2.truncate(4, 6), pser2.truncate(4, 6)) - self.assert_eq(psser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False)) + self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6)) + self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False)) else: - expected_psser = ps.Series([20, 30, 40], index=[6, 5, 4]) - self.assert_eq(psser2.truncate(4, 6), expected_psser) - self.assert_eq(psser2.truncate(4, 6, copy=False), expected_psser) + expected_kser = ps.Series([20, 30, 40], index=[6, 5, 4]) + self.assert_eq(kser2.truncate(4, 6), expected_kser) + self.assert_eq(kser2.truncate(4, 6, copy=False), expected_kser) - psser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1]) + kser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1]) msg = "truncate requires a sorted index" with self.assertRaisesRegex(ValueError, msg): - psser.truncate() + kser.truncate() - psser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7]) + kser = ps.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7]) msg = "Truncate: 2 must be after 5" with self.assertRaisesRegex(ValueError, msg): - psser.truncate(5, 2) + kser.truncate(5, 2) def test_getitem(self): pser = pd.Series([10, 20, 15, 30, 45], ["A", "A", "B", "C", "D"]) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser["A"], pser["A"]) - self.assert_eq(psser["B"], pser["B"]) - self.assert_eq(psser[psser > 15], pser[pser > 15]) + self.assert_eq(kser["A"], pser["A"]) + self.assert_eq(kser["B"], pser["B"]) + self.assert_eq(kser[kser > 15], pser[pser > 15]) # for MultiIndex midx = pd.MultiIndex( @@ -1802,15 +1793,15 @@ def test_getitem(self): [[0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], name="0", index=midx) - psser = ps.Series(pser) + kser = ps.Series(pser) - self.assert_eq(psser["a"], pser["a"]) - self.assert_eq(psser["a", "lama"], pser["a", "lama"]) - self.assert_eq(psser[psser > 1.5], pser[pser > 1.5]) + self.assert_eq(kser["a"], pser["a"]) + self.assert_eq(kser["a", "lama"], pser["a", "lama"]) + self.assert_eq(kser[kser > 1.5], pser[pser > 1.5]) msg = r"'Key length \(4\) exceeds index depth \(3\)'" with self.assertRaisesRegex(KeyError, msg): - psser[("a", "lama", "speed", "x")] + kser[("a", "lama", "speed", "x")] def test_keys(self): midx = pd.MultiIndex( @@ -1818,35 +1809,35 @@ def test_keys(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.keys(), pser.keys()) + self.assert_eq(kser.keys(), pser.keys()) def test_index(self): # to check setting name of Index properly. idx = pd.Index([1, 2, 3, 4, 5, 6, 7, 8, 9]) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=idx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - psser.name = "koalas" + kser.name = "koalas" pser.name = "koalas" - self.assert_eq(psser.index.name, pser.index.name) + self.assert_eq(kser.index.name, pser.index.name) # for check setting names of MultiIndex properly. 
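# Aside (illustrative only, not part of the patch): renaming index levels in
# plain pandas, which is the behavior the MultiIndex name checks mirror.
import pandas as pd

midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
s = pd.Series([1, 2], index=midx)
s.index.names = ["hello", "koalas"]  # rename both index levels at once
print(s.index.names)                 # ['hello', 'koalas']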
- psser.names = ["hello", "koalas"] + kser.names = ["hello", "koalas"] pser.names = ["hello", "koalas"] - self.assert_eq(psser.index.names, pser.index.names) + self.assert_eq(kser.index.names, pser.index.names) def test_pct_change(self): pser = pd.Series([90, 91, 85], index=[2, 4, 1]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False) - self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True) - self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False) - self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False) - self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000)) - self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000)) + self.assert_eq(kser.pct_change(), pser.pct_change(), check_exact=False) + self.assert_eq(kser.pct_change().sum(), pser.pct_change().sum(), almost=True) + self.assert_eq(kser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False) + self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False) + self.assert_eq(kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000)) + self.assert_eq(kser.pct_change(periods=100000000), pser.pct_change(periods=100000000)) # for MultiIndex midx = pd.MultiIndex( @@ -1854,19 +1845,19 @@ def test_pct_change(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False) - self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True) - self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False) - self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False) - self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000)) - self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000)) + self.assert_eq(kser.pct_change(), pser.pct_change(), check_exact=False) + self.assert_eq(kser.pct_change().sum(), pser.pct_change().sum(), almost=True) + self.assert_eq(kser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False) + self.assert_eq(kser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False) + self.assert_eq(kser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000)) + self.assert_eq(kser.pct_change(periods=100000000), pser.pct_change(periods=100000000)) def test_axes(self): pser = pd.Series([90, 91, 85], index=[2, 4, 1]) - psser = ps.from_pandas(pser) - self.assert_eq(psser.axes, pser.axes) + kser = ps.from_pandas(pser) + self.assert_eq(kser.axes, pser.axes) # for MultiIndex midx = pd.MultiIndex( @@ -1874,332 +1865,332 @@ def test_axes(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - psser = ps.from_pandas(pser) - self.assert_eq(psser.axes, pser.axes) + kser = ps.from_pandas(pser) + self.assert_eq(kser.axes, pser.axes) def test_udt(self): sparse_values = {0: 0.1, 1: 1.1} sparse_vector = SparseVector(len(sparse_values), sparse_values) pser = pd.Series([sparse_vector]) - psser = ps.from_pandas(pser) - self.assert_eq(psser, pser) + kser = ps.from_pandas(pser) + 
self.assert_eq(kser, pser) def test_repeat(self): pser = pd.Series(["a", "b", "c"], name="0", index=np.random.rand(3)) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.repeat(3).sort_index(), pser.repeat(3).sort_index()) - self.assert_eq(psser.repeat(0).sort_index(), pser.repeat(0).sort_index()) + self.assert_eq(kser.repeat(3).sort_index(), pser.repeat(3).sort_index()) + self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index()) - self.assertRaises(ValueError, lambda: psser.repeat(-1)) - self.assertRaises(TypeError, lambda: psser.repeat("abc")) + self.assertRaises(ValueError, lambda: kser.repeat(-1)) + self.assertRaises(ValueError, lambda: kser.repeat("abc")) pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3)) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq(psdf.a.repeat(psdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index()) + self.assert_eq(kdf.a.repeat(kdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index()) def test_take(self): pser = pd.Series([100, 200, 300, 400, 500], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.take([0, 2, 4]).sort_values(), pser.take([0, 2, 4]).sort_values()) + self.assert_eq(kser.take([0, 2, 4]).sort_values(), pser.take([0, 2, 4]).sort_values()) self.assert_eq( - psser.take(range(0, 5, 2)).sort_values(), pser.take(range(0, 5, 2)).sort_values() + kser.take(range(0, 5, 2)).sort_values(), pser.take(range(0, 5, 2)).sort_values() ) - self.assert_eq(psser.take([-4, -2, 0]).sort_values(), pser.take([-4, -2, 0]).sort_values()) + self.assert_eq(kser.take([-4, -2, 0]).sort_values(), pser.take([-4, -2, 0]).sort_values()) self.assert_eq( - psser.take(range(-2, 1, 2)).sort_values(), pser.take(range(-2, 1, 2)).sort_values() + kser.take(range(-2, 1, 2)).sort_values(), pser.take(range(-2, 1, 2)).sort_values() ) # Checking the type of indices. 
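# Aside (illustrative only, not part of the patch): take() is purely
# positional in pandas, and negative positions count from the end, which is
# what the assertions above verify for the distributed implementation.
import pandas as pd

s = pd.Series([100, 200, 300, 400, 500])
print(s.take([0, 2, 4]).tolist())    # [100, 300, 500]
print(s.take([-4, -2, 0]).tolist())  # [200, 400, 100]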
- self.assertRaises(TypeError, lambda: psser.take(1)) - self.assertRaises(TypeError, lambda: psser.take("1")) - self.assertRaises(TypeError, lambda: psser.take({1, 2})) - self.assertRaises(TypeError, lambda: psser.take({1: None, 2: None})) + self.assertRaises(ValueError, lambda: kser.take(1)) + self.assertRaises(ValueError, lambda: kser.take("1")) + self.assertRaises(ValueError, lambda: kser.take({1, 2})) + self.assertRaises(ValueError, lambda: kser.take({1: None, 2: None})) def test_divmod(self): pser = pd.Series([100, None, 300, None, 500], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - kdiv, kmod = psser.divmod(-100) + kdiv, kmod = kser.divmod(-100) pdiv, pmod = pser.divmod(-100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) - kdiv, kmod = psser.divmod(100) + kdiv, kmod = kser.divmod(100) pdiv, pmod = pser.divmod(100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - kdiv, kmod = psser.divmod(-100) + kdiv, kmod = kser.divmod(-100) pdiv, pmod = pser.floordiv(-100), pser.mod(-100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) - kdiv, kmod = psser.divmod(100) + kdiv, kmod = kser.divmod(100) pdiv, pmod = pser.floordiv(100), pser.mod(100) self.assert_eq(kdiv, pdiv) self.assert_eq(kmod, pmod) def test_rdivmod(self): pser = pd.Series([100, None, 300, None, 500]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - krdiv, krmod = psser.rdivmod(-100) + krdiv, krmod = kser.rdivmod(-100) prdiv, prmod = pser.rdivmod(-100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) - krdiv, krmod = psser.rdivmod(100) + krdiv, krmod = kser.rdivmod(100) prdiv, prmod = pser.rdivmod(100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - krdiv, krmod = psser.rdivmod(-100) + krdiv, krmod = kser.rdivmod(-100) prdiv, prmod = pser.rfloordiv(-100), pser.rmod(-100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) - krdiv, krmod = psser.rdivmod(100) + krdiv, krmod = kser.rdivmod(100) prdiv, prmod = pser.rfloordiv(100), pser.rmod(100) self.assert_eq(krdiv, prdiv) self.assert_eq(krmod, prmod) def test_mod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.mod(-150), pser.mod(-150)) - self.assert_eq(psser.mod(0), pser.mod(0)) - self.assert_eq(psser.mod(150), pser.mod(150)) + self.assert_eq(kser.mod(-150), pser.mod(-150)) + self.assert_eq(kser.mod(0), pser.mod(0)) + self.assert_eq(kser.mod(150), pser.mod(150)) pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6}) - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.a.mod(psdf.b), pdf.a.mod(pdf.b)) + kdf = ps.from_pandas(pdf) + self.assert_eq(kdf.a.mod(kdf.b), pdf.a.mod(pdf.b)) def test_mode(self): pser = pd.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan]) - psser = ps.from_pandas(pser) - self.assert_eq(psser.mode(), pser.mode()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.mode(), pser.mode()) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `dropna` argument is added in pandas 0.24. 
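# Aside (illustrative only, not part of the patch; requires pandas >= 0.24):
# mode() drops NaN by default, while dropna=False lets NaN compete for the
# modal value, which is the distinction the dropna checks below rely on.
import numpy as np
import pandas as pd

s = pd.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan])
print(s.mode().tolist())              # [1.0]
print(s.mode(dropna=False).tolist())  # [1.0, nan]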
self.assert_eq( - psser.mode(dropna=False).sort_values().reset_index(drop=True), + kser.mode(dropna=False).sort_values().reset_index(drop=True), pser.mode(dropna=False).sort_values().reset_index(drop=True), ) pser.name = "x" - psser = ps.from_pandas(pser) - self.assert_eq(psser.mode(), pser.mode()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.mode(), pser.mode()) if LooseVersion(pd.__version__) >= LooseVersion("0.24"): # The `dropna` argument is added in pandas 0.24. self.assert_eq( - psser.mode(dropna=False).sort_values().reset_index(drop=True), + kser.mode(dropna=False).sort_values().reset_index(drop=True), pser.mode(dropna=False).sort_values().reset_index(drop=True), ) def test_rmod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.rmod(-150), pser.rmod(-150)) - self.assert_eq(psser.rmod(0), pser.rmod(0)) - self.assert_eq(psser.rmod(150), pser.rmod(150)) + self.assert_eq(kser.rmod(-150), pser.rmod(-150)) + self.assert_eq(kser.rmod(0), pser.rmod(0)) + self.assert_eq(kser.rmod(150), pser.rmod(150)) pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6}) - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.a.rmod(psdf.b), pdf.a.rmod(pdf.b)) + kdf = ps.from_pandas(pdf) + self.assert_eq(kdf.a.rmod(kdf.b), pdf.a.rmod(pdf.b)) def test_asof(self): pser = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.asof(20), pser.asof(20)) - self.assert_eq(psser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index()) - self.assert_eq(psser.asof(100), pser.asof(100)) - self.assert_eq(repr(psser.asof(-100)), repr(pser.asof(-100))) - self.assert_eq(psser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index()) + self.assert_eq(kser.asof(20), pser.asof(20)) + self.assert_eq(kser.asof([5, 20]).sort_index(), pser.asof([5, 20]).sort_index()) + self.assert_eq(kser.asof(100), pser.asof(100)) + self.assert_eq(repr(kser.asof(-100)), repr(pser.asof(-100))) + self.assert_eq(kser.asof([-100, 100]).sort_index(), pser.asof([-100, 100]).sort_index()) # where cannot be an Index, Series or a DataFrame - self.assertRaises(ValueError, lambda: psser.asof(ps.Index([-100, 100]))) - self.assertRaises(ValueError, lambda: psser.asof(ps.Series([-100, 100]))) - self.assertRaises(ValueError, lambda: psser.asof(ps.DataFrame({"A": [1, 2, 3]}))) + self.assertRaises(ValueError, lambda: kser.asof(ps.Index([-100, 100]))) + self.assertRaises(ValueError, lambda: kser.asof(ps.Series([-100, 100]))) + self.assertRaises(ValueError, lambda: kser.asof(ps.DataFrame({"A": [1, 2, 3]}))) # asof is not supported for a MultiIndex pser.index = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c"), ("y", "d")]) - psser = ps.from_pandas(pser) - self.assertRaises(ValueError, lambda: psser.asof(20)) + kser = ps.from_pandas(pser) + self.assertRaises(ValueError, lambda: kser.asof(20)) # asof requires a sorted index (More precisely, should be a monotonic increasing) - psser = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40], name="Koalas") - self.assertRaises(ValueError, lambda: psser.asof(20)) - psser = ps.Series([1, 2, np.nan, 4], index=[40, 30, 20, 10], name="Koalas") - self.assertRaises(ValueError, lambda: psser.asof(20)) + kser = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40], name="Koalas") + self.assertRaises(ValueError, lambda: kser.asof(20)) + kser = ps.Series([1, 2, np.nan, 4], 
index=[40, 30, 20, 10], name="Koalas") + self.assertRaises(ValueError, lambda: kser.asof(20)) pidx = pd.DatetimeIndex(["2013-12-31", "2014-01-02", "2014-01-03"]) pser = pd.Series([1, 2, np.nan], index=pidx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.asof("2014-01-01"), pser.asof("2014-01-01")) - self.assert_eq(psser.asof("2014-01-02"), pser.asof("2014-01-02")) - self.assert_eq(repr(psser.asof("1999-01-02")), repr(pser.asof("1999-01-02"))) + self.assert_eq(kser.asof("2014-01-01"), pser.asof("2014-01-01")) + self.assert_eq(kser.asof("2014-01-02"), pser.asof("2014-01-02")) + self.assert_eq(repr(kser.asof("1999-01-02")), repr(pser.asof("1999-01-02"))) def test_squeeze(self): # Single value pser = pd.Series([90]) - psser = ps.from_pandas(pser) - self.assert_eq(psser.squeeze(), pser.squeeze()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.squeeze(), pser.squeeze()) # Single value with MultiIndex midx = pd.MultiIndex.from_tuples([("a", "b", "c")]) pser = pd.Series([90], index=midx) - psser = ps.from_pandas(pser) - self.assert_eq(psser.squeeze(), pser.squeeze()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.squeeze(), pser.squeeze()) # Multiple values pser = pd.Series([90, 91, 85]) - psser = ps.from_pandas(pser) - self.assert_eq(psser.squeeze(), pser.squeeze()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.squeeze(), pser.squeeze()) # Multiple values with MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series([90, 91, 85], index=midx) - psser = ps.from_pandas(pser) - self.assert_eq(psser.squeeze(), pser.squeeze()) + kser = ps.from_pandas(pser) + self.assert_eq(kser.squeeze(), pser.squeeze()) def test_swaplevel(self): # MultiIndex with two levels arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) pser = pd.Series(["a", "b", "c", "d"], index=pidx) - psser = ps.from_pandas(pser) - self.assert_eq(pser.swaplevel(), psser.swaplevel()) - self.assert_eq(pser.swaplevel(0, 1), psser.swaplevel(0, 1)) - self.assert_eq(pser.swaplevel(1, 1), psser.swaplevel(1, 1)) - self.assert_eq(pser.swaplevel("number", "color"), psser.swaplevel("number", "color")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.swaplevel(), kser.swaplevel()) + self.assert_eq(pser.swaplevel(0, 1), kser.swaplevel(0, 1)) + self.assert_eq(pser.swaplevel(1, 1), kser.swaplevel(1, 1)) + self.assert_eq(pser.swaplevel("number", "color"), kser.swaplevel("number", "color")) # MultiIndex with more than two levels arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"], ["l", "m", "s", "xs"]] pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color", "size")) pser = pd.Series(["a", "b", "c", "d"], index=pidx) - psser = ps.from_pandas(pser) - self.assert_eq(pser.swaplevel(), psser.swaplevel()) - self.assert_eq(pser.swaplevel(0, 1), psser.swaplevel(0, 1)) - self.assert_eq(pser.swaplevel(0, 2), psser.swaplevel(0, 2)) - self.assert_eq(pser.swaplevel(1, 2), psser.swaplevel(1, 2)) - self.assert_eq(pser.swaplevel(1, 1), psser.swaplevel(1, 1)) - self.assert_eq(pser.swaplevel(-1, -2), psser.swaplevel(-1, -2)) - self.assert_eq(pser.swaplevel("number", "color"), psser.swaplevel("number", "color")) - self.assert_eq(pser.swaplevel("number", "size"), psser.swaplevel("number", "size")) - self.assert_eq(pser.swaplevel("color", "size"), psser.swaplevel("color", "size")) + kser = ps.from_pandas(pser) + self.assert_eq(pser.swaplevel(), kser.swaplevel()) + self.assert_eq(pser.swaplevel(0, 
1), kser.swaplevel(0, 1)) + self.assert_eq(pser.swaplevel(0, 2), kser.swaplevel(0, 2)) + self.assert_eq(pser.swaplevel(1, 2), kser.swaplevel(1, 2)) + self.assert_eq(pser.swaplevel(1, 1), kser.swaplevel(1, 1)) + self.assert_eq(pser.swaplevel(-1, -2), kser.swaplevel(-1, -2)) + self.assert_eq(pser.swaplevel("number", "color"), kser.swaplevel("number", "color")) + self.assert_eq(pser.swaplevel("number", "size"), kser.swaplevel("number", "size")) + self.assert_eq(pser.swaplevel("color", "size"), kser.swaplevel("color", "size")) # Error conditions self.assertRaises(AssertionError, lambda: ps.Series([1, 2]).swaplevel()) - self.assertRaises(IndexError, lambda: psser.swaplevel(0, 9)) - self.assertRaises(KeyError, lambda: psser.swaplevel("not_number", "color")) - self.assertRaises(AssertionError, lambda: psser.swaplevel(copy=False)) + self.assertRaises(IndexError, lambda: kser.swaplevel(0, 9)) + self.assertRaises(KeyError, lambda: kser.swaplevel("not_number", "color")) + self.assertRaises(AssertionError, lambda: kser.swaplevel(copy=False)) def test_swapaxes(self): pser = pd.Series([1, 2, 3], index=["x", "y", "z"], name="ser") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(psser.swapaxes(0, 0), pser.swapaxes(0, 0)) - self.assert_eq(psser.swapaxes("index", "index"), pser.swapaxes("index", "index")) - self.assert_eq((psser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0)) + self.assert_eq(kser.swapaxes(0, 0), pser.swapaxes(0, 0)) + self.assert_eq(kser.swapaxes("index", "index"), pser.swapaxes("index", "index")) + self.assert_eq((kser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0)) - self.assertRaises(AssertionError, lambda: psser.swapaxes(0, 1, copy=False)) - self.assertRaises(ValueError, lambda: psser.swapaxes(0, 1)) - self.assertRaises(ValueError, lambda: psser.swapaxes("index", "columns")) + self.assertRaises(AssertionError, lambda: kser.swapaxes(0, 1, copy=False)) + self.assertRaises(ValueError, lambda: kser.swapaxes(0, 1)) + self.assertRaises(ValueError, lambda: kser.swapaxes("index", "columns")) def test_div_zero_and_nan(self): pser = pd.Series([100, None, -300, None, 500, -700, np.inf, -np.inf], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.div(0), psser.div(0)) - self.assert_eq(pser.truediv(0), psser.truediv(0)) - self.assert_eq(pser / 0, psser / 0) - self.assert_eq(pser.div(np.nan), psser.div(np.nan)) - self.assert_eq(pser.truediv(np.nan), psser.truediv(np.nan)) - self.assert_eq(pser / np.nan, psser / np.nan) + self.assert_eq(pser.div(0), kser.div(0)) + self.assert_eq(pser.truediv(0), kser.truediv(0)) + self.assert_eq(pser / 0, kser / 0) + self.assert_eq(pser.div(np.nan), kser.div(np.nan)) + self.assert_eq(pser.truediv(np.nan), kser.truediv(np.nan)) + self.assert_eq(pser / np.nan, kser / np.nan) # floordiv has different behavior in pandas > 1.0.0 when divide by 0 if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(pser.floordiv(0), psser.floordiv(0)) - self.assert_eq(pser // 0, psser // 0) + self.assert_eq(pser.floordiv(0), kser.floordiv(0)) + self.assert_eq(pser // 0, kser // 0) else: result = pd.Series( [np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf], name="Koalas" ) - self.assert_eq(psser.floordiv(0), result) - self.assert_eq(psser // 0, result) - self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan)) + self.assert_eq(kser.floordiv(0), result) + self.assert_eq(kser // 0, result) + self.assert_eq(pser.floordiv(np.nan), kser.floordiv(np.nan)) def test_mad(self): 
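# Aside (illustrative only, not part of the patch): mad() is the mean
# absolute deviation around the mean, i.e. (s - s.mean()).abs().mean(), with
# NaNs skipped; the equivalence below is what the following test exercises.
import pandas as pd

s = pd.Series([1, 2, 3, 4])
assert s.mad() == (s - s.mean()).abs().mean() == 1.0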
pser = pd.Series([1, 2, 3, 4], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), psser.mad()) + self.assert_eq(pser.mad(), kser.mad()) pser = pd.Series([None, -2, 5, 10, 50, np.nan, -20], name="Koalas") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), psser.mad()) + self.assert_eq(pser.mad(), kser.mad()) pmidx = pd.MultiIndex.from_tuples( [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")] ) pser = pd.Series([1, 2, 3, 4, 5], name="Koalas") pser.index = pmidx - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), psser.mad()) + self.assert_eq(pser.mad(), kser.mad()) pmidx = pd.MultiIndex.from_tuples( [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")] ) pser = pd.Series([None, -2, 5, 50, np.nan], name="Koalas") pser.index = pmidx - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), psser.mad()) + self.assert_eq(pser.mad(), kser.mad()) def test_to_frame(self): pser = pd.Series(["a", "b", "c"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.to_frame(name="a"), psser.to_frame(name="a")) + self.assert_eq(pser.to_frame(name="a"), kser.to_frame(name="a")) # for MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series(["a", "b", "c"], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.to_frame(name="a"), psser.to_frame(name="a")) + self.assert_eq(pser.to_frame(name="a"), kser.to_frame(name="a")) def test_shape(self): pser = pd.Series(["a", "b", "c"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.shape, psser.shape) + self.assert_eq(pser.shape, kser.shape) # for MultiIndex midx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pser = pd.Series(["a", "b", "c"], index=midx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.shape, psser.shape) + self.assert_eq(pser.shape, kser.shape) @unittest.skipIf(not have_tabulate, tabulate_requirement_message) def test_to_markdown(self): pser = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) # `to_markdown()` is supported in pandas >= 1.0.0 since it's newly added in pandas 1.0.0. 
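# Aside (illustrative only, not part of the patch; assumes the optional
# tabulate package is installed): the version-gate idiom this file uses for
# pandas APIs that only exist from a certain release onward.
from distutils.version import LooseVersion
import pandas as pd

if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
    print(pd.Series(["elk", "pig"], name="animal").to_markdown())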
if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assertRaises(NotImplementedError, lambda: psser.to_markdown()) + self.assertRaises(NotImplementedError, lambda: kser.to_markdown()) else: - self.assert_eq(pser.to_markdown(), psser.to_markdown()) + self.assert_eq(pser.to_markdown(), kser.to_markdown()) def test_unstack(self): pser = pd.Series( @@ -2209,112 +2200,112 @@ def test_unstack(self): names=["A", "B", "C"], ), ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) levels = [-3, -2, -1, 0, 1, 2] for level in levels: pandas_result = pser.unstack(level=level) - pandas_on_spark_result = psser.unstack(level=level).sort_index() - self.assert_eq(pandas_result, pandas_on_spark_result) - self.assert_eq(pandas_result.index.names, pandas_on_spark_result.index.names) - self.assert_eq(pandas_result.columns.names, pandas_on_spark_result.columns.names) + koalas_result = kser.unstack(level=level).sort_index() + self.assert_eq(pandas_result, koalas_result) + self.assert_eq(pandas_result.index.names, koalas_result.index.names) + self.assert_eq(pandas_result.columns.names, koalas_result.columns.names) # non-numeric datatypes pser = pd.Series( list("abcd"), index=pd.MultiIndex.from_product([["one", "two"], ["a", "b"]]) ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) levels = [-2, -1, 0, 1] for level in levels: pandas_result = pser.unstack(level=level) - pandas_on_spark_result = psser.unstack(level=level).sort_index() - self.assert_eq(pandas_result, pandas_on_spark_result) - self.assert_eq(pandas_result.index.names, pandas_on_spark_result.index.names) - self.assert_eq(pandas_result.columns.names, pandas_on_spark_result.columns.names) + koalas_result = kser.unstack(level=level).sort_index() + self.assert_eq(pandas_result, koalas_result) + self.assert_eq(pandas_result.index.names, koalas_result.index.names) + self.assert_eq(pandas_result.columns.names, koalas_result.columns.names) # Exceeding the range of level - self.assertRaises(IndexError, lambda: psser.unstack(level=3)) - self.assertRaises(IndexError, lambda: psser.unstack(level=-4)) + self.assertRaises(IndexError, lambda: kser.unstack(level=3)) + self.assertRaises(IndexError, lambda: kser.unstack(level=-4)) # Only support for MultiIndex - psser = ps.Series([10, -2, 4, 7]) - self.assertRaises(ValueError, lambda: psser.unstack()) + kser = ps.Series([10, -2, 4, 7]) + self.assertRaises(ValueError, lambda: kser.unstack()) def test_item(self): - psser = ps.Series([10, 20]) - self.assertRaises(ValueError, lambda: psser.item()) + kser = ps.Series([10, 20]) + self.assertRaises(ValueError, lambda: kser.item()) def test_filter(self): pser = pd.Series([0, 1, 2], index=["one", "two", "three"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.filter(items=["one", "three"]), psser.filter(items=["one", "three"])) - self.assert_eq(pser.filter(regex="e$"), psser.filter(regex="e$")) - self.assert_eq(pser.filter(like="hre"), psser.filter(like="hre")) + self.assert_eq(pser.filter(items=["one", "three"]), kser.filter(items=["one", "three"])) + self.assert_eq(pser.filter(regex="e$"), kser.filter(regex="e$")) + self.assert_eq(pser.filter(like="hre"), kser.filter(like="hre")) with self.assertRaisesRegex(ValueError, "Series does not support columns axis."): - psser.filter(like="hre", axis=1) + kser.filter(like="hre", axis=1) # for MultiIndex midx = pd.MultiIndex.from_tuples([("one", "x"), ("two", "y"), ("three", "z")]) pser = pd.Series([0, 1, 2], index=midx) - psser = ps.from_pandas(pser) + kser = 
ps.from_pandas(pser) self.assert_eq( pser.filter(items=[("one", "x"), ("three", "z")]), - psser.filter(items=[("one", "x"), ("three", "z")]), + kser.filter(items=[("one", "x"), ("three", "z")]), ) with self.assertRaisesRegex(TypeError, "Unsupported type list"): - psser.filter(items=[["one", "x"], ("three", "z")]) + kser.filter(items=[["one", "x"], ("three", "z")]) with self.assertRaisesRegex(ValueError, "The item should not be empty."): - psser.filter(items=[(), ("three", "z")]) + kser.filter(items=[(), ("three", "z")]) def test_abs(self): pser = pd.Series([-2, -1, 0, 1]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(abs(psser), abs(pser)) - self.assert_eq(np.abs(psser), np.abs(pser)) + self.assert_eq(abs(kser), abs(pser)) + self.assert_eq(np.abs(kser), np.abs(pser)) def test_bfill(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.bfill(), pser.bfill()) - self.assert_eq(psser.bfill()[0], pser.bfill()[0]) + self.assert_eq(kser.bfill(), pser.bfill()) + self.assert_eq(kser.bfill()[0], pser.bfill()[0]) - psser.bfill(inplace=True) + kser.bfill(inplace=True) pser.bfill(inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psser[0], pser[0]) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kser[0], pser[0]) + self.assert_eq(kdf, pdf) def test_ffill(self): pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) pser = pdf.x - psser = psdf.x + kser = kdf.x - self.assert_eq(psser.ffill(), pser.ffill()) - self.assert_eq(psser.ffill()[4], pser.ffill()[4]) + self.assert_eq(kser.ffill(), pser.ffill()) + self.assert_eq(kser.ffill()[4], pser.ffill()[4]) - psser.ffill(inplace=True) + kser.ffill(inplace=True) pser.ffill(inplace=True) - self.assert_eq(psser, pser) - self.assert_eq(psser[4], pser[4]) - self.assert_eq(psdf, pdf) + self.assert_eq(kser, pser) + self.assert_eq(kser[4], pser[4]) + self.assert_eq(kdf, pdf) def test_iteritems(self): pser = pd.Series(["A", "B", "C"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - for (p_name, p_items), (k_name, k_items) in zip(pser.iteritems(), psser.iteritems()): + for (p_name, p_items), (k_name, k_items) in zip(pser.iteritems(), kser.iteritems()): self.assert_eq(p_name, k_name) self.assert_eq(p_items, k_items) @@ -2328,136 +2319,136 @@ def test_droplevel(self): names=["level_1", "level_2", "level_3"], ), ) - psser = ps.from_pandas(pser) - - self.assert_eq(pser.droplevel(0), psser.droplevel(0)) - self.assert_eq(pser.droplevel("level_1"), psser.droplevel("level_1")) - self.assert_eq(pser.droplevel(-1), psser.droplevel(-1)) - self.assert_eq(pser.droplevel([0]), psser.droplevel([0])) - self.assert_eq(pser.droplevel(["level_1"]), psser.droplevel(["level_1"])) - self.assert_eq(pser.droplevel((0,)), psser.droplevel((0,))) - self.assert_eq(pser.droplevel(("level_1",)), psser.droplevel(("level_1",))) - self.assert_eq(pser.droplevel([0, 2]), psser.droplevel([0, 2])) + kser = ps.from_pandas(pser) + + self.assert_eq(pser.droplevel(0), kser.droplevel(0)) + self.assert_eq(pser.droplevel("level_1"), kser.droplevel("level_1")) + self.assert_eq(pser.droplevel(-1), kser.droplevel(-1)) + self.assert_eq(pser.droplevel([0]), kser.droplevel([0])) + self.assert_eq(pser.droplevel(["level_1"]), kser.droplevel(["level_1"])) + 
self.assert_eq(pser.droplevel((0,)), kser.droplevel((0,))) + self.assert_eq(pser.droplevel(("level_1",)), kser.droplevel(("level_1",))) + self.assert_eq(pser.droplevel([0, 2]), kser.droplevel([0, 2])) self.assert_eq( - pser.droplevel(["level_1", "level_3"]), psser.droplevel(["level_1", "level_3"]) + pser.droplevel(["level_1", "level_3"]), kser.droplevel(["level_1", "level_3"]) ) - self.assert_eq(pser.droplevel((1, 2)), psser.droplevel((1, 2))) + self.assert_eq(pser.droplevel((1, 2)), kser.droplevel((1, 2))) self.assert_eq( - pser.droplevel(("level_2", "level_3")), psser.droplevel(("level_2", "level_3")) + pser.droplevel(("level_2", "level_3")), kser.droplevel(("level_2", "level_3")) ) with self.assertRaisesRegex(KeyError, "Level {0, 1, 2} not found"): - psser.droplevel({0, 1, 2}) + kser.droplevel({0, 1, 2}) with self.assertRaisesRegex(KeyError, "Level level_100 not found"): - psser.droplevel(["level_1", "level_100"]) + kser.droplevel(["level_1", "level_100"]) with self.assertRaisesRegex( IndexError, "Too many levels: Index has only 3 levels, not 11" ): - psser.droplevel(10) + kser.droplevel(10) with self.assertRaisesRegex( IndexError, "Too many levels: Index has only 3 levels, -10 is not a valid level number", ): - psser.droplevel(-10) + kser.droplevel(-10) with self.assertRaisesRegex( ValueError, "Cannot remove 3 levels from an index with 3 levels: " "at least one level must be left.", ): - psser.droplevel([0, 1, 2]) + kser.droplevel([0, 1, 2]) with self.assertRaisesRegex( ValueError, "Cannot remove 5 levels from an index with 3 levels: " "at least one level must be left.", ): - psser.droplevel([1, 1, 1, 1, 1]) + kser.droplevel([1, 1, 1, 1, 1]) # Tupled names pser.index.names = [("a", "1"), ("b", "2"), ("c", "3")] - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.droplevel([("a", "1"), ("c", "3")]), psser.droplevel([("a", "1"), ("c", "3")]) + pser.droplevel([("a", "1"), ("c", "3")]), kser.droplevel([("a", "1"), ("c", "3")]) ) def test_dot(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) - self.assert_eq((psdf["b"] * 10).dot(psdf["a"]), (pdf["b"] * 10).dot(pdf["a"])) - self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf)) - self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1)) + self.assert_eq((kdf["b"] * 10).dot(kdf["a"]), (pdf["b"] * 10).dot(pdf["a"])) + self.assert_eq((kdf["b"] * 10).dot(kdf), (pdf["b"] * 10).dot(pdf)) + self.assert_eq((kdf["b"] * 10).dot(kdf + 1), (pdf["b"] * 10).dot(pdf + 1)) def test_tail(self): pser = pd.Series(range(1000), name="Koalas") - psser = ps.from_pandas(pser) - - self.assert_eq(pser.tail(), psser.tail()) - self.assert_eq(pser.tail(10), psser.tail(10)) - self.assert_eq(pser.tail(-990), psser.tail(-990)) - self.assert_eq(pser.tail(0), psser.tail(0)) - self.assert_eq(pser.tail(1001), psser.tail(1001)) - self.assert_eq(pser.tail(-1001), psser.tail(-1001)) - self.assert_eq((pser + 1).tail(), (psser + 1).tail()) - self.assert_eq((pser + 1).tail(10), (psser + 1).tail(10)) - self.assert_eq((pser + 1).tail(-990), (psser + 1).tail(-990)) - self.assert_eq((pser + 1).tail(0), (psser + 1).tail(0)) - self.assert_eq((pser + 1).tail(1001), (psser + 1).tail(1001)) - self.assert_eq((pser + 1).tail(-1001), (psser + 1).tail(-1001)) + kser = ps.from_pandas(pser) + + self.assert_eq(pser.tail(), kser.tail()) + self.assert_eq(pser.tail(10), kser.tail(10)) + self.assert_eq(pser.tail(-990), kser.tail(-990)) + self.assert_eq(pser.tail(0), 
kser.tail(0)) + self.assert_eq(pser.tail(1001), kser.tail(1001)) + self.assert_eq(pser.tail(-1001), kser.tail(-1001)) + self.assert_eq((pser + 1).tail(), (kser + 1).tail()) + self.assert_eq((pser + 1).tail(10), (kser + 1).tail(10)) + self.assert_eq((pser + 1).tail(-990), (kser + 1).tail(-990)) + self.assert_eq((pser + 1).tail(0), (kser + 1).tail(0)) + self.assert_eq((pser + 1).tail(1001), (kser + 1).tail(1001)) + self.assert_eq((pser + 1).tail(-1001), (kser + 1).tail(-1001)) with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"): - psser.tail("10") + kser.tail("10") def test_product(self): pser = pd.Series([10, 20, 30, 40, 50]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) # Containing NA values pser = pd.Series([10, np.nan, 30, np.nan, 50]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod(), almost=True) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod(), almost=True) # All-NA values pser = pd.Series([np.nan, np.nan, np.nan]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) # Empty Series pser = pd.Series([]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) # Boolean Series pser = pd.Series([True, True, True]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) pser = pd.Series([False, False, False]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) pser = pd.Series([True, False, True]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(), psser.prod()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(), kser.prod()) # With `min_count` parameter pser = pd.Series([10, 20, 30, 40, 50]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=5), psser.prod(min_count=5)) - self.assert_eq(pser.prod(min_count=6), psser.prod(min_count=6)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=5), kser.prod(min_count=5)) + self.assert_eq(pser.prod(min_count=6), kser.prod(min_count=6)) pser = pd.Series([10, np.nan, 30, np.nan, 50]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=3), psser.prod(min_count=3), almost=True) - self.assert_eq(pser.prod(min_count=4), psser.prod(min_count=4)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=3), kser.prod(min_count=3), almost=True) + self.assert_eq(pser.prod(min_count=4), kser.prod(min_count=4)) pser = pd.Series([np.nan, np.nan, np.nan]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1)) pser = pd.Series([]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1)) + kser = ps.from_pandas(pser) + self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1)) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): ps.Series(["a", "b", "c"]).prod() @@ -2469,26 +2460,26 @@ def test_product(self): def test_hasnans(self): # BooleanType pser = pd.Series([True, False, True, True]) - psser = ps.from_pandas(pser) - 
self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) pser = pd.Series([True, False, np.nan, True]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) # TimestampType pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) + kser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) def test_last_valid_index(self): pser = pd.Series([250, 1.5, 320, 1, 0.3, None, None, None, None]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) # MultiIndex columns midx = pd.MultiIndex( @@ -2496,48 +2487,48 @@ def test_last_valid_index(self): [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) pser.index = midx - psser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) # Empty Series pser = pd.Series([]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.last_valid_index(), psser.last_valid_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.last_valid_index(), kser.last_valid_index()) def test_first_valid_index(self): # Empty Series pser = pd.Series([]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.first_valid_index(), psser.first_valid_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.first_valid_index(), kser.first_valid_index()) def test_factorize(self): pser = pd.Series(["a", "b", "a", "b"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([5, 1, 5, 1]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = (pser + 1).factorize(sort=True) - kcodes, kuniques = (psser + 1).factorize() + kcodes, kuniques = (kser + 1).factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series(["a", "b", "a", "b"], name="ser", index=["w", "x", "y", "z"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series( ["a", "b", "a", "b"], index=pd.MultiIndex.from_arrays([[4, 3, 2, 1], [1, 2, 3, 4]]) ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) @@ -2545,38 +2536,38 @@ def test_factorize(self): # Deals with None and np.nan # pser = pd.Series(["a", "b", "a", np.nan]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = 
pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([1, None, 3, 2, 1]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series(["a", None, "a"]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True) - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pser = pd.Series([None, np.nan]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize() - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes, kcodes.to_list()) # pandas: Float64Index([], dtype='float64') self.assert_eq(pd.Index([]), kuniques) pser = pd.Series([np.nan, np.nan]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize() - kcodes, kuniques = psser.factorize() + kcodes, kuniques = kser.factorize() self.assert_eq(pcodes, kcodes.to_list()) # pandas: Float64Index([], dtype='float64') self.assert_eq(pd.Index([]), kuniques) @@ -2591,75 +2582,75 @@ def test_factorize(self): pd_below_0_24 = LooseVersion(pd.__version__) < LooseVersion("0.24") pser = pd.Series(["a", "b", "a", np.nan, None]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2) - kcodes, kuniques = psser.factorize(na_sentinel=-2) + kcodes, kuniques = kser.factorize(na_sentinel=-2) self.assert_eq([0, 1, 0, -2, -2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pcodes, puniques = pser.factorize(sort=True, na_sentinel=2) - kcodes, kuniques = psser.factorize(na_sentinel=2) + kcodes, kuniques = kser.factorize(na_sentinel=2) self.assert_eq([0, 1, 0, 2, 2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) if not pd_below_1_1_2: pcodes, puniques = pser.factorize(sort=True, na_sentinel=None) - kcodes, kuniques = psser.factorize(na_sentinel=None) + kcodes, kuniques = kser.factorize(na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) # puniques is Index(['a', 'b', nan], dtype='object') self.assert_eq(ps.Index(["a", "b", None]), kuniques) - psser = ps.Series([1, 2, np.nan, 4, 5]) # Arrow takes np.nan as null - psser.loc[3] = np.nan # Spark takes np.nan as NaN - kcodes, kuniques = psser.factorize(na_sentinel=None) - pcodes, puniques = psser.to_pandas().factorize(sort=True, na_sentinel=None) + kser = ps.Series([1, 2, np.nan, 4, 5]) # Arrow takes np.nan as null + kser.loc[3] = np.nan # Spark takes np.nan as NaN + kcodes, kuniques = kser.factorize(na_sentinel=None) + pcodes, puniques = kser.to_pandas().factorize(sort=True, na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) def test_pad(self): pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(pser.pad(), psser.pad()) + self.assert_eq(pser.pad(), kser.pad()) # Test `inplace=True` pser.pad(inplace=True) - 
psser.pad(inplace=True) - self.assert_eq(pser, psser) + kser.pad(inplace=True) + self.assert_eq(pser, kser) else: expected = ps.Series([np.nan, 2, 3, 4, 4, 6], name="x") - self.assert_eq(expected, psser.pad()) + self.assert_eq(expected, kser.pad()) # Test `inplace=True` - psser.pad(inplace=True) - self.assert_eq(expected, psser) + kser.pad(inplace=True) + self.assert_eq(expected, kser) def test_explode(self): if LooseVersion(pd.__version__) >= LooseVersion("0.25"): pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode(), almost=True) + kser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), kser.explode(), almost=True) # MultiIndex pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode(), almost=True) + kser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), kser.explode(), almost=True) # non-array type Series pser = pd.Series([1, 2, 3, 4]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), kser.explode()) else: pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) expected = pd.Series([1.0, 2.0, 3.0, None, None, 3.0, 4.0], index=[0, 0, 0, 1, 2, 3, 3]) - self.assert_eq(psser.explode(), expected) + self.assert_eq(kser.explode(), expected) # MultiIndex pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) expected = pd.Series( [1.0, 2.0, 3.0, None, None, 3.0, 4.0], index=pd.MultiIndex.from_tuples( @@ -2674,106 +2665,106 @@ def test_explode(self): ] ), ) - self.assert_eq(psser.explode(), expected) + self.assert_eq(kser.explode(), expected) # non-array type Series pser = pd.Series([1, 2, 3, 4]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) expected = pser - self.assert_eq(psser.explode(), expected) + self.assert_eq(kser.explode(), expected) def test_argsort(self): # Without null values pser = pd.Series([0, -100, 50, 100, 20], index=["A", "B", "C", "D", "E"]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # With name pser.name = "Koalas" - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # Series from Index pidx = pd.Index([4.0, 
-6.0, 2.0, -100.0, 11.0, 20.0, 1.0, -99.0]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() ) # Series from Index with name pidx.name = "Koalas" - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() ) # Series from DataFrame pdf = pd.DataFrame({"A": [4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]}) - psdf = ps.from_pandas(pdf) - self.assert_eq(pdf.A.argsort().sort_index(), psdf.A.argsort().sort_index()) - self.assert_eq((-pdf.A).argsort().sort_index(), (-psdf.A).argsort().sort_index()) + kdf = ps.from_pandas(pdf) + self.assert_eq(pdf.A.argsort().sort_index(), kdf.A.argsort().sort_index()) + self.assert_eq((-pdf.A).argsort().sort_index(), (-kdf.A).argsort().sort_index()) # With null values pser = pd.Series([0, -100, np.nan, 100, np.nan], index=["A", "B", "C", "D", "E"]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # MultiIndex with null values pser.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # With name with null values pser.name = "Koalas" - psser = ps.from_pandas(pser) - self.assert_eq(pser.argsort().sort_index(), psser.argsort().sort_index()) - self.assert_eq((-pser).argsort().sort_index(), (-psser).argsort().sort_index()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argsort().sort_index(), kser.argsort().sort_index()) + self.assert_eq((-pser).argsort().sort_index(), (-kser).argsort().sort_index()) # Series from Index with null values pidx = pd.Index([4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]) - psidx = ps.from_pandas(pidx) + kidx = ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() ) # Series from Index with name with null values pidx.name = "Koalas" - psidx = ps.from_pandas(pidx) + kidx = 
ps.from_pandas(pidx) self.assert_eq( - pidx.to_series().argsort().sort_index(), psidx.to_series().argsort().sort_index() + pidx.to_series().argsort().sort_index(), kidx.to_series().argsort().sort_index() ) self.assert_eq( - (-pidx.to_series()).argsort().sort_index(), (-psidx.to_series()).argsort().sort_index() + (-pidx.to_series()).argsort().sort_index(), (-kidx.to_series()).argsort().sort_index() ) # Series from DataFrame with null values pdf = pd.DataFrame({"A": [4.0, -6.0, 2.0, np.nan, -100.0, 11.0, 20.0, np.nan, 1.0, -99.0]}) - psdf = ps.from_pandas(pdf) - self.assert_eq(pdf.A.argsort().sort_index(), psdf.A.argsort().sort_index()) - self.assert_eq((-pdf.A).argsort().sort_index(), (-psdf.A).argsort().sort_index()) + kdf = ps.from_pandas(pdf) + self.assert_eq(pdf.A.argsort().sort_index(), kdf.A.argsort().sort_index()) + self.assert_eq((-pdf.A).argsort().sort_index(), (-kdf.A).argsort().sort_index()) def test_argmin_argmax(self): pser = pd.Series( @@ -2787,34 +2778,34 @@ def test_argmin_argmax(self): }, name="Koalas", ) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(pser.argmin(), psser.argmin()) - self.assert_eq(pser.argmax(), psser.argmax()) + self.assert_eq(pser.argmin(), kser.argmin()) + self.assert_eq(pser.argmax(), kser.argmax()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argmin(), psser.argmin()) - self.assert_eq(pser.argmax(), psser.argmax()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.argmin(), kser.argmin()) + self.assert_eq(pser.argmax(), kser.argmax()) # Null Series self.assert_eq(pd.Series([np.nan]).argmin(), ps.Series([np.nan]).argmin()) self.assert_eq(pd.Series([np.nan]).argmax(), ps.Series([np.nan]).argmax()) else: - self.assert_eq(pser.values.argmin(), psser.argmin()) - self.assert_eq(pser.values.argmax(), psser.argmax()) + self.assert_eq(pser.values.argmin(), kser.argmin()) + self.assert_eq(pser.values.argmax(), kser.argmax()) # MultiIndex pser.index = pd.MultiIndex.from_tuples( [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.values.argmin(), psser.argmin()) - self.assert_eq(pser.values.argmax(), psser.argmax()) + kser = ps.from_pandas(pser) + self.assert_eq(pser.values.argmin(), kser.argmin()) + self.assert_eq(pser.values.argmax(), kser.argmax()) # Null Series self.assert_eq(-1, ps.Series([np.nan]).argmin()) @@ -2827,91 +2818,91 @@ def test_argmin_argmax(self): def test_backfill(self): pser = pd.Series([np.nan, 2, 3, 4, np.nan, 6], name="x") - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) if LooseVersion(pd.__version__) >= LooseVersion("1.1"): - self.assert_eq(pser.backfill(), psser.backfill()) + self.assert_eq(pser.backfill(), kser.backfill()) # Test `inplace=True` pser.backfill(inplace=True) - psser.backfill(inplace=True) - self.assert_eq(pser, psser) + kser.backfill(inplace=True) + self.assert_eq(pser, kser) else: expected = ps.Series([2.0, 2.0, 3.0, 4.0, 6.0, 6.0], name="x") - self.assert_eq(expected, psser.backfill()) + self.assert_eq(expected, kser.backfill()) # Test `inplace=True` - psser.backfill(inplace=True) - self.assert_eq(expected, psser) + kser.backfill(inplace=True) + self.assert_eq(expected, kser) def test_align(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - psdf = ps.from_pandas(pdf) + kdf = ps.from_pandas(pdf) 
for join in ["outer", "inner", "left", "right"]: for axis in [None, 0]: - psser_l, psser_r = psdf.a.align(psdf.b, join=join, axis=axis) + kser_l, kser_r = kdf.a.align(kdf.b, join=join, axis=axis) pser_l, pser_r = pdf.a.align(pdf.b, join=join, axis=axis) - self.assert_eq(psser_l, pser_l) - self.assert_eq(psser_r, pser_r) + self.assert_eq(kser_l, pser_l) + self.assert_eq(kser_r, pser_r) - psser_l, psdf_r = psdf.b.align(psdf[["b", "a"]], join=join, axis=axis) + kser_l, kdf_r = kdf.b.align(kdf[["b", "a"]], join=join, axis=axis) pser_l, pdf_r = pdf.b.align(pdf[["b", "a"]], join=join, axis=axis) - self.assert_eq(psser_l, pser_l) - self.assert_eq(psdf_r, pdf_r) + self.assert_eq(kser_l, pser_l) + self.assert_eq(kdf_r, pdf_r) - self.assertRaises(ValueError, lambda: psdf.a.align(psdf.b, axis=1)) + self.assertRaises(ValueError, lambda: kdf.a.align(kdf.b, axis=1)) def test_pow_and_rpow(self): pser = pd.Series([1, 2, np.nan]) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) - self.assert_eq(pser.pow(np.nan), psser.pow(np.nan)) - self.assert_eq(pser ** np.nan, psser ** np.nan) - self.assert_eq(pser.rpow(np.nan), psser.rpow(np.nan)) - self.assert_eq(1 ** pser, 1 ** psser) + self.assert_eq(pser.pow(np.nan), kser.pow(np.nan)) + self.assert_eq(pser ** np.nan, kser ** np.nan) + self.assert_eq(pser.rpow(np.nan), kser.rpow(np.nan)) + self.assert_eq(1 ** pser, 1 ** kser) def test_between_time(self): idx = pd.date_range("2018-04-09", periods=4, freq="1D20min") pser = pd.Series([1, 2, 3, 4], index=idx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - psser.between_time("0:15", "0:45").sort_index(), + kser.between_time("0:15", "0:45").sort_index(), ) pser.index.name = "ts" - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - psser.between_time("0:15", "0:45").sort_index(), + kser.between_time("0:15", "0:45").sort_index(), ) pser.index.name = "index" - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( pser.between_time("0:15", "0:45").sort_index(), - psser.between_time("0:15", "0:45").sort_index(), + kser.between_time("0:15", "0:45").sort_index(), ) def test_at_time(self): idx = pd.date_range("2018-04-09", periods=4, freq="1D20min") pser = pd.Series([1, 2, 3, 4], index=idx) - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), ) pser.index.name = "ts" - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), ) pser.index.name = "index" - psser = ps.from_pandas(pser) + kser = ps.from_pandas(pser) self.assert_eq( - pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(), + pser.at_time("0:20").sort_index(), kser.at_time("0:20").sort_index(), ) @@ -2920,8 +2911,7 @@ def test_at_time(self): try: import xmlrunner # type: ignore[import] - - testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) From 62d85185cd9b18d0257725c67d10f4aa8ec0686f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 8 Jun 2021 
18:52:49 -0700 Subject: [PATCH 23/30] Revert "[SPARK-35194][SQL][FOLLOWUP] Recover build error with Scala 2.13 on GA" This reverts commit 0a2edadedfbcae8b9ed6ff4f04ec9cd39bf50ba7. --- .../spark/sql/catalyst/optimizer/NestedColumnAliasing.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index e0e8f926019f6..cd7032d555992 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -146,8 +146,7 @@ object NestedColumnAliasing { val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap // A reference attribute can have multiple aliases for nested fields. - val attrToAliases = - AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2)).toSeq) + val attrToAliases = AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2))) plan match { case Project(projectList, child) => From b7176e367d79096c2b6cd809a21a05a61817c5f3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 8 Jun 2021 18:53:17 -0700 Subject: [PATCH 24/30] Revert "[SPARK-35194][SQL] Refactor nested column aliasing for readability" This reverts commit cf8e6d63a05b314c45f0d427de3f6889d99e8d3c. --- .../catalyst/expressions/AttributeMap.scala | 6 - .../catalyst/expressions/AttributeMap.scala | 6 - .../optimizer/NestedColumnAliasing.scala | 426 ++++++++---------- .../sql/catalyst/optimizer/Optimizer.scala | 4 +- .../optimizer/NestedColumnAliasingSuite.scala | 2 +- 5 files changed, 194 insertions(+), 250 deletions(-) diff --git a/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala index 189318acd8661..42b92d4593c77 100644 --- a/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala +++ b/sql/catalyst/src/main/scala-2.12/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala @@ -23,10 +23,6 @@ package org.apache.spark.sql.catalyst.expressions * of the name, or the expected nullability). 
*/ object AttributeMap { - def apply[A](kvs: Map[Attribute, A]): AttributeMap[A] = { - new AttributeMap(kvs.map(kv => (kv._1.exprId, kv))) - } - def apply[A](kvs: Seq[(Attribute, A)]): AttributeMap[A] = { new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap) } @@ -41,8 +37,6 @@ class AttributeMap[A](val baseMap: Map[ExprId, (Attribute, A)]) override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2) - override def getOrElse[B1 >: A](k: Attribute, default: => B1): B1 = get(k).getOrElse(default) - override def contains(k: Attribute): Boolean = get(k).isDefined override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] = baseMap.values.toMap + kv diff --git a/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala index 77152918bf687..e6b53e3e6548f 100644 --- a/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala +++ b/sql/catalyst/src/main/scala-2.13/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala @@ -23,10 +23,6 @@ package org.apache.spark.sql.catalyst.expressions * of the name, or the expected nullability). */ object AttributeMap { - def apply[A](kvs: Map[Attribute, A]): AttributeMap[A] = { - new AttributeMap(kvs.map(kv => (kv._1.exprId, kv))) - } - def apply[A](kvs: Seq[(Attribute, A)]): AttributeMap[A] = { new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap) } @@ -41,8 +37,6 @@ class AttributeMap[A](val baseMap: Map[ExprId, (Attribute, A)]) override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2) - override def getOrElse[B1 >: A](k: Attribute, default: => B1): B1 = get(k).getOrElse(default) - override def contains(k: Attribute): Boolean = get(k).isDefined override def updated[B1 >: A](key: Attribute, value: B1): Map[Attribute, B1] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index cd7032d555992..5b12667f4a884 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -17,151 +17,71 @@ package org.apache.spark.sql.catalyst.optimizer -import scala.collection.mutable - import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** - * This aims to handle a nested column aliasing pattern inside the [[ColumnPruning]] optimizer rule. 
- * If: - * - A [[Project]] or its child references nested fields - * - Not all of the fields in a nested attribute are used - * Then: - * - Substitute the nested field references with alias attributes - * - Add grandchild [[Project]]s transforming the nested fields to aliases - * - * Example 1: Project - * ------------------ - * Before: - * +- Project [concat_ws(s#0.a, s#0.b) AS concat_ws(s.a, s.b)#1] - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- LocalRelation , [s#0] - * After: - * +- Project [concat_ws(_extract_a#2, _extract_b#3) AS concat_ws(s.a, s.b)#1] - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- Project [s#0.a AS _extract_a#2, s#0.b AS _extract_b#3] - * +- LocalRelation , [s#0] - * - * Example 2: Project above Filter - * ------------------------------- - * Before: - * +- Project [s#0.a AS s.a#1] - * +- Filter (length(s#0.b) > 2) - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- LocalRelation , [s#0] - * After: - * +- Project [_extract_a#2 AS s.a#1] - * +- Filter (length(_extract_b#3) > 2) - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- Project [s#0.a AS _extract_a#2, s#0.b AS _extract_b#3] - * +- LocalRelation , [s#0] - * - * Example 3: Nested fields with referenced parents - * ------------------------------------------------ - * Before: - * +- Project [s#0.a AS s.a#1, s#0.a.a1 AS s.a.a1#2] - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- LocalRelation , [s#0] - * After: - * +- Project [_extract_a#3 AS s.a#1, _extract_a#3.name AS s.a.a1#2] - * +- GlobalLimit 5 - * +- LocalLimit 5 - * +- Project [s#0.a AS _extract_a#3] - * +- LocalRelation , [s#0] - * - * The schema of the datasource relation will be pruned in the [[SchemaPruning]] optimizer rule. + * This aims to handle a nested column aliasing pattern inside the `ColumnPruning` optimizer rule. + * If a project or its child references to nested fields, and not all the fields + * in a nested attribute are used, we can substitute them by alias attributes; then a project + * of the nested fields as aliases on the children of the child will be created. */ object NestedColumnAliasing { def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { /** * This pattern is needed to support [[Filter]] plan cases like - * [[Project]]->[[Filter]]->listed plan in [[canProjectPushThrough]] (e.g., [[Window]]). - * The reason why we don't simply add [[Filter]] in [[canProjectPushThrough]] is that + * [[Project]]->[[Filter]]->listed plan in `canProjectPushThrough` (e.g., [[Window]]). + * The reason why we don't simply add [[Filter]] in `canProjectPushThrough` is that * the optimizer can hit an infinite loop during the [[PushDownPredicates]] rule. 
*/ - case Project(projectList, Filter(condition, child)) if - SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => - rewritePlanIfSubsetFieldsUsed( - plan, projectList ++ Seq(condition) ++ child.expressions, child.producedAttributes.toSeq) + case Project(projectList, Filter(condition, child)) + if SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => + val exprCandidatesToPrune = projectList ++ Seq(condition) ++ child.expressions + getAliasSubMap(exprCandidatesToPrune, child.producedAttributes.toSeq).map { + case (nestedFieldToAlias, attrToAliases) => + NestedColumnAliasing.replaceToAliases(plan, nestedFieldToAlias, attrToAliases) + } - case Project(projectList, child) if - SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => - rewritePlanIfSubsetFieldsUsed( - plan, projectList ++ child.expressions, child.producedAttributes.toSeq) + case Project(projectList, child) + if SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => + val exprCandidatesToPrune = projectList ++ child.expressions + getAliasSubMap(exprCandidatesToPrune, child.producedAttributes.toSeq).map { + case (nestedFieldToAlias, attrToAliases) => + NestedColumnAliasing.replaceToAliases(plan, nestedFieldToAlias, attrToAliases) + } case p if SQLConf.get.nestedSchemaPruningEnabled && canPruneOn(p) => - rewritePlanIfSubsetFieldsUsed( - plan, p.expressions, p.producedAttributes.toSeq) + val exprCandidatesToPrune = p.expressions + getAliasSubMap(exprCandidatesToPrune, p.producedAttributes.toSeq).map { + case (nestedFieldToAlias, attrToAliases) => + NestedColumnAliasing.replaceToAliases(p, nestedFieldToAlias, attrToAliases) + } case _ => None } - /** - * Rewrites a plan with aliases if only a subset of the nested fields are used. - */ - def rewritePlanIfSubsetFieldsUsed( - plan: LogicalPlan, - exprList: Seq[Expression], - exclusiveAttrs: Seq[Attribute]): Option[LogicalPlan] = { - val attrToExtractValues = getAttributeToExtractValues(exprList, exclusiveAttrs) - if (attrToExtractValues.isEmpty) { - None - } else { - Some(rewritePlanWithAliases(plan, attrToExtractValues)) - } - } - /** * Replace nested columns to prune unused nested columns later. */ - def rewritePlanWithAliases( + private def replaceToAliases( plan: LogicalPlan, - attributeToExtractValues: Map[Attribute, Seq[ExtractValue]]): LogicalPlan = { - // Each expression can contain multiple nested fields. - // Note that we keep the original names to deliver to parquet in a case-sensitive way. - // A new alias is created for each nested field. - // Implementation detail: we don't use mapValues, because it creates a mutable view. - val attributeToExtractValuesAndAliases = - attributeToExtractValues.map { case (attr, evSeq) => - val evAliasSeq = evSeq.map { ev => - val fieldName = ev match { - case g: GetStructField => g.extractFieldName - case g: GetArrayStructFields => g.field.name - } - ev -> Alias(ev, s"_extract_$fieldName")() - } - - attr -> evAliasSeq - } - - val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap - - // A reference attribute can have multiple aliases for nested fields. - val attrToAliases = AttributeMap(attributeToExtractValuesAndAliases.mapValues(_.map(_._2))) - - plan match { - case Project(projectList, child) => - Project( - getNewProjectList(projectList, nestedFieldToAlias), - replaceWithAliases(child, nestedFieldToAlias, attrToAliases)) - - // The operators reaching here are already guarded by [[canPruneOn]]. 
- case other => - replaceWithAliases(other, nestedFieldToAlias, attrToAliases) - } + nestedFieldToAlias: Map[ExtractValue, Alias], + attrToAliases: Map[ExprId, Seq[Alias]]): LogicalPlan = plan match { + case Project(projectList, child) => + Project( + getNewProjectList(projectList, nestedFieldToAlias), + replaceWithAliases(child, nestedFieldToAlias, attrToAliases)) + + // The operators reaching here was already guarded by `canPruneOn`. + case other => + replaceWithAliases(other, nestedFieldToAlias, attrToAliases) } /** - * Replace the [[ExtractValue]]s in a project list with aliased attributes. + * Return a replaced project list. */ def getNewProjectList( projectList: Seq[NamedExpression], @@ -173,15 +93,15 @@ object NestedColumnAliasing { } /** - * Replace the grandchildren of a plan with [[Project]]s of the nested fields as aliases, - * and replace the [[ExtractValue]] expressions with aliased attributes. + * Return a plan with new children replaced with aliases, and expressions replaced with + * aliased attributes. */ def replaceWithAliases( plan: LogicalPlan, nestedFieldToAlias: Map[ExtractValue, Alias], - attrToAliases: AttributeMap[Seq[Alias]]): LogicalPlan = { + attrToAliases: Map[ExprId, Seq[Alias]]): LogicalPlan = { plan.withNewChildren(plan.children.map { plan => - Project(plan.output.flatMap(a => attrToAliases.getOrElse(a, Seq(a))), plan) + Project(plan.output.flatMap(a => attrToAliases.getOrElse(a.exprId, Seq(a))), plan) }).transformExpressions { case f: ExtractValue if nestedFieldToAlias.contains(f) => nestedFieldToAlias(f).toAttribute @@ -189,7 +109,7 @@ object NestedColumnAliasing { } /** - * Returns true for operators on which we can prune nested columns. + * Returns true for those operators that we can prune nested column on it. */ private def canPruneOn(plan: LogicalPlan) = plan match { case _: Aggregate => true @@ -198,7 +118,7 @@ object NestedColumnAliasing { } /** - * Returns true for operators through which project can be pushed. + * Returns true for those operators that project can be pushed through. */ private def canProjectPushThrough(plan: LogicalPlan) = plan match { case _: GlobalLimit => true @@ -213,10 +133,9 @@ object NestedColumnAliasing { } /** - * Returns two types of expressions: - * - Root references that are individually accessed - * - [[GetStructField]] or [[GetArrayStructFields]] on top of other [[ExtractValue]]s - * or special expressions. + * Return root references that are individually accessed as a whole, and `GetStructField`s + * or `GetArrayStructField`s which on top of other `ExtractValue`s or special expressions. + * Check `SelectedField` to see which expressions should be listed here. */ private def collectRootReferenceAndExtractValue(e: Expression): Seq[Expression] = e match { case _: AttributeReference => Seq(e) @@ -230,55 +149,67 @@ object NestedColumnAliasing { } /** - * Creates a map from root [[Attribute]]s to non-redundant nested [[ExtractValue]]s. - * Nested field accessors of `exclusiveAttrs` are not considered in nested fields aliasing. + * Return two maps in order to replace nested fields to aliases. + * + * If `exclusiveAttrs` is given, any nested field accessors of these attributes + * won't be considered in nested fields aliasing. + * + * 1. ExtractValue -> Alias: A new alias is created for each nested field. + * 2. ExprId -> Seq[Alias]: A reference attribute has multiple aliases pointing it. 
*/ - def getAttributeToExtractValues( - exprList: Seq[Expression], - exclusiveAttrs: Seq[Attribute]): Map[Attribute, Seq[ExtractValue]] = { - - val nestedFieldReferences = new mutable.ArrayBuffer[ExtractValue]() - val otherRootReferences = new mutable.ArrayBuffer[AttributeReference]() - exprList.foreach { e => - collectRootReferenceAndExtractValue(e).foreach { - case ev: ExtractValue => - if (ev.references.size == 1) { - nestedFieldReferences.append(ev) - } - case ar: AttributeReference => otherRootReferences.append(ar) + def getAliasSubMap(exprList: Seq[Expression], exclusiveAttrs: Seq[Attribute] = Seq.empty) + : Option[(Map[ExtractValue, Alias], Map[ExprId, Seq[Alias]])] = { + val (nestedFieldReferences, otherRootReferences) = + exprList.flatMap(collectRootReferenceAndExtractValue).partition { + case _: ExtractValue => true + case _ => false } - } - val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences) - // Remove cosmetic variations when we group extractors by their references - nestedFieldReferences + // Note that when we group by extractors with their references, we should remove + // cosmetic variations. + val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences) + val aliasSub = nestedFieldReferences.asInstanceOf[Seq[ExtractValue]] .filter(!_.references.subsetOf(exclusiveAttrSet)) .groupBy(_.references.head.canonicalized.asInstanceOf[Attribute]) - .flatMap { case (attr: Attribute, nestedFields: Seq[ExtractValue]) => - // Remove redundant [[ExtractValue]]s if they share the same parent nest field. + .flatMap { case (attr, nestedFields: Seq[ExtractValue]) => + // Remove redundant `ExtractValue`s if they share the same parent nest field. // For example, when `a.b` and `a.b.c` are in project list, we only need to alias `a.b`. - // Because `a.b` requires all of the inner fields of `b`, we cannot prune `a.b.c`. + // We only need to deal with two `ExtractValue`: `GetArrayStructFields` and + // `GetStructField`. Please refer to the method `collectRootReferenceAndExtractValue`. val dedupNestedFields = nestedFields.filter { - // See [[collectExtractValue]]: we only need to deal with [[GetArrayStructFields]] and - // [[GetStructField]] case e @ (_: GetStructField | _: GetArrayStructFields) => val child = e.children.head nestedFields.forall(f => child.find(_.semanticEquals(f)).isEmpty) case _ => true - }.distinct + } + + // Each expression can contain multiple nested fields. + // Note that we keep the original names to deliver to parquet in a case-sensitive way. + val nestedFieldToAlias = dedupNestedFields.distinct.map { f => + val exprId = NamedExpression.newExprId + (f, Alias(f, s"_gen_alias_${exprId.id}")(exprId, Seq.empty, None)) + } // If all nested fields of `attr` are used, we don't need to introduce new aliases. - // By default, the [[ColumnPruning]] rule uses `attr` already. + // By default, ColumnPruning rule uses `attr` already. // Note that we need to remove cosmetic variations first, so we only count a // nested field once. 
- val numUsedNestedFields = dedupNestedFields.map(_.canonicalized).distinct - .map { nestedField => totalFieldNum(nestedField.dataType) }.sum - if (numUsedNestedFields < totalFieldNum(attr.dataType)) { - Some((attr, dedupNestedFields.toSeq)) + if (nestedFieldToAlias.nonEmpty && + dedupNestedFields.map(_.canonicalized) + .distinct + .map { nestedField => totalFieldNum(nestedField.dataType) } + .sum < totalFieldNum(attr.dataType)) { + Some(attr.exprId -> nestedFieldToAlias) } else { None } } + + if (aliasSub.isEmpty) { + None + } else { + Some((aliasSub.values.flatten.toMap, aliasSub.map(x => (x._1, x._2.map(_._2))))) + } } /** @@ -296,9 +227,31 @@ object NestedColumnAliasing { } /** - * This prunes unnecessary nested columns from [[Generate]], or [[Project]] -> [[Generate]] + * This prunes unnecessary nested columns from `Generate` and optional `Project` on top + * of it. */ object GeneratorNestedColumnAliasing { + // Partitions `attrToAliases` based on whether the attribute is in Generator's output. + private def aliasesOnGeneratorOutput( + attrToAliases: Map[ExprId, Seq[Alias]], + generatorOutput: Seq[Attribute]) = { + val generatorOutputExprId = generatorOutput.map(_.exprId) + attrToAliases.partition { k => + generatorOutputExprId.contains(k._1) + } + } + + // Partitions `nestedFieldToAlias` based on whether the attribute of nested field extractor + // is in Generator's output. + private def nestedFieldOnGeneratorOutput( + nestedFieldToAlias: Map[ExtractValue, Alias], + generatorOutput: Seq[Attribute]) = { + val generatorOutputSet = AttributeSet(generatorOutput) + nestedFieldToAlias.partition { pair => + pair._1.references.subsetOf(generatorOutputSet) + } + } + def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { // Either `nestedPruningOnExpressions` or `nestedSchemaPruningEnabled` is enabled, we // need to prune nested columns through Project and under Generate. The difference is @@ -308,100 +261,103 @@ object GeneratorNestedColumnAliasing { SQLConf.get.nestedSchemaPruningEnabled) && canPruneGenerator(g.generator) => // On top on `Generate`, a `Project` that might have nested column accessors. // We try to get alias maps for both project list and generator's children expressions. - val attrToExtractValues = NestedColumnAliasing.getAttributeToExtractValues( - projectList ++ g.generator.children, Seq.empty) - if (attrToExtractValues.isEmpty) { - return None - } - val generatorOutputSet = AttributeSet(g.qualifiedGeneratorOutput) - val (attrToExtractValuesOnGenerator, attrToExtractValuesNotOnGenerator) = - attrToExtractValues.partition { case (attr, _) => - attr.references.subsetOf(generatorOutputSet) } - - val pushedThrough = NestedColumnAliasing.rewritePlanWithAliases( - plan, attrToExtractValuesNotOnGenerator) - - // If the generator output is `ArrayType`, we cannot push through the extractor. - // It is because we don't allow field extractor on two-level array, - // i.e., attr.field when attr is a ArrayType(ArrayType(...)). - // Similarily, we also cannot push through if the child of generator is `MapType`. 
- g.generator.children.head.dataType match { - case _: MapType => return Some(pushedThrough) - case ArrayType(_: ArrayType, _) => return Some(pushedThrough) - case _ => - } + val exprsToPrune = projectList ++ g.generator.children + NestedColumnAliasing.getAliasSubMap(exprsToPrune).map { + case (nestedFieldToAlias, attrToAliases) => + val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) = + nestedFieldOnGeneratorOutput(nestedFieldToAlias, g.qualifiedGeneratorOutput) + val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) = + aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput) + + // Push nested column accessors through `Generator`. + // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. + val newChild = NestedColumnAliasing.replaceWithAliases(g, + nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator) + val pushedThrough = Project(NestedColumnAliasing + .getNewProjectList(projectList, nestedFieldsNotOnGenerator), newChild) + + // If the generator output is `ArrayType`, we cannot push through the extractor. + // It is because we don't allow field extractor on two-level array, + // i.e., attr.field when attr is a ArrayType(ArrayType(...)). + // Similarily, we also cannot push through if the child of generator is `MapType`. + g.generator.children.head.dataType match { + case _: MapType => return Some(pushedThrough) + case ArrayType(_: ArrayType, _) => return Some(pushedThrough) + case _ => + } - // Pruning on `Generator`'s output. We only process single field case. - // For multiple field case, we cannot directly move field extractor into - // the generator expression. A workaround is to re-construct array of struct - // from multiple fields. But it will be more complicated and may not worth. - // TODO(SPARK-34956): support multiple fields. - val nestedFieldsOnGenerator = attrToExtractValuesOnGenerator.values.flatten.toSet - if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.isEmpty) { - Some(pushedThrough) - } else { - // Only one nested column accessor. - // E.g., df.select(explode($"items").as("item")).select($"item.a") - val nestedFieldOnGenerator = nestedFieldsOnGenerator.head - pushedThrough match { - case p @ Project(_, newG: Generate) => - // Replace the child expression of `ExplodeBase` generator with - // nested column accessor. - // E.g., df.select(explode($"items").as("item")).select($"item.a") => - // df.select(explode($"items.a").as("item.a")) - val rewrittenG = newG.transformExpressions { - case e: ExplodeBase => - val extractor = nestedFieldOnGenerator.transformUp { - case _: Attribute => - e.child - case g: GetStructField => - ExtractValue(g.child, Literal(g.extractFieldName), SQLConf.get.resolver) + // Pruning on `Generator`'s output. We only process single field case. + // For multiple field case, we cannot directly move field extractor into + // the generator expression. A workaround is to re-construct array of struct + // from multiple fields. But it will be more complicated and may not worth. + // TODO(SPARK-34956): support multiple fields. + if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.isEmpty) { + pushedThrough + } else { + // Only one nested column accessor. + // E.g., df.select(explode($"items").as("item")).select($"item.a") + pushedThrough match { + case p @ Project(_, newG: Generate) => + // Replace the child expression of `ExplodeBase` generator with + // nested column accessor. 
+ // E.g., df.select(explode($"items").as("item")).select($"item.a") => + // df.select(explode($"items.a").as("item.a")) + val rewrittenG = newG.transformExpressions { + case e: ExplodeBase => + val extractor = nestedFieldsOnGenerator.head._1.transformUp { + case _: Attribute => + e.child + case g: GetStructField => + ExtractValue(g.child, Literal(g.extractFieldName), SQLConf.get.resolver) + } + e.withNewChildren(Seq(extractor)) } - e.withNewChildren(Seq(extractor)) - } - // As we change the child of the generator, its output data type must be updated. - val updatedGeneratorOutput = rewrittenG.generatorOutput - .zip(rewrittenG.generator.elementSchema.toAttributes) - .map { case (oldAttr, newAttr) => - newAttr.withExprId(oldAttr.exprId).withName(oldAttr.name) - } - assert(updatedGeneratorOutput.length == rewrittenG.generatorOutput.length, - "Updated generator output must have the same length " + - "with original generator output.") - val updatedGenerate = rewrittenG.copy(generatorOutput = updatedGeneratorOutput) - - // Replace nested column accessor with generator output. - val attrExprIdsOnGenerator = attrToExtractValuesOnGenerator.keys.map(_.exprId).toSet - val updatedProject = p.withNewChildren(Seq(updatedGenerate)).transformExpressions { - case f: ExtractValue if nestedFieldsOnGenerator.contains(f) => - updatedGenerate.output - .find(a => attrExprIdsOnGenerator.contains(a.exprId)) - .getOrElse(f) - } - Some(updatedProject) + // As we change the child of the generator, its output data type must be updated. + val updatedGeneratorOutput = rewrittenG.generatorOutput + .zip(rewrittenG.generator.elementSchema.toAttributes) + .map { case (oldAttr, newAttr) => + newAttr.withExprId(oldAttr.exprId).withName(oldAttr.name) + } + assert(updatedGeneratorOutput.length == rewrittenG.generatorOutput.length, + "Updated generator output must have the same length " + + "with original generator output.") + val updatedGenerate = rewrittenG.copy(generatorOutput = updatedGeneratorOutput) + + // Replace nested column accessor with generator output. + p.withNewChildren(Seq(updatedGenerate)).transformExpressions { + case f: ExtractValue if nestedFieldsOnGenerator.contains(f) => + updatedGenerate.output + .find(a => attrToAliasesOnGenerator.contains(a.exprId)) + .getOrElse(f) + } - case other => - // We should not reach here. - throw new IllegalStateException(s"Unreasonable plan after optimization: $other") - } + case other => + // We should not reach here. + throw new IllegalStateException(s"Unreasonable plan after optimization: $other") + } + } } case g: Generate if SQLConf.get.nestedSchemaPruningEnabled && - canPruneGenerator(g.generator) => + canPruneGenerator(g.generator) => // If any child output is required by higher projection, we cannot prune on it even we // only use part of nested column of it. A required child output means it is referred // as a whole or partially by higher projection, pruning it here will cause unresolved // query plan. - NestedColumnAliasing.rewritePlanIfSubsetFieldsUsed( - plan, g.generator.children, g.requiredChildOutput) + NestedColumnAliasing.getAliasSubMap( + g.generator.children, g.requiredChildOutput).map { + case (nestedFieldToAlias, attrToAliases) => + // Defer updating `Generate.unrequiredChildIndex` to next round of `ColumnPruning`. + NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias, attrToAliases) + } case _ => None } /** - * Types of [[Generator]] on which we can prune nested fields. + * This is a while-list for pruning nested fields at `Generator`. 
 */
  def canPruneGenerator(g: Generator): Boolean = g match {
    case _: Explode => true

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index c90f4bcdd2602..16e3e43356b9c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -785,7 +785,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
       p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices))
 
     // prune unrequired nested fields from `Generate`.
-    case GeneratorNestedColumnAliasing(rewrittenPlan) => rewrittenPlan
+    case GeneratorNestedColumnAliasing(p) => p
 
     // Eliminate unneeded attributes from right side of a Left Existence Join.
     case j @ Join(_, right, LeftExistence(_), _, _) =>
@@ -819,7 +819,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
     // Can't prune the columns on LeafNode
     case p @ Project(_, _: LeafNode) => p
 
-    case NestedColumnAliasing(rewrittenPlan) => rewrittenPlan
+    case NestedColumnAliasing(p) => p
 
     // for all other logical plans that inherits the output from it's children
     // Project over project is handled by the first case, skip it here.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
index 643974c9c707d..a856caa6781e8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
@@ -714,7 +714,7 @@ object NestedColumnAliasingSuite {
   def collectGeneratedAliases(query: LogicalPlan): ArrayBuffer[String] = {
     val aliases = ArrayBuffer[String]()
     query.transformAllExpressions {
-      case a @ Alias(_, name) if name.startsWith("_extract_") =>
+      case a @ Alias(_, name) if name.startsWith("_gen_alias_") =>
         aliases += name
         a
     }

From 9a9f0ff35eabfdd7c512261bd2c087f885574af6 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 18:53:57 -0700
Subject: [PATCH 25/30] Revert "[SPARK-35535][SQL] New data source V2 API: LocalScan"

This reverts commit e5ee38c71fcdca68838caa1e672ec06f5de5fc26.
---
 .../spark/sql/connector/read/LocalScan.java   | 31 ------
 .../datasources/v2/DataSourceV2Strategy.scala |  6 --
 .../spark/sql/connector/LocalScanSuite.scala  | 95 ------------------
 3 files changed, 132 deletions(-)
 delete mode 100644 sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java
 delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala

diff --git a/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java b/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java
deleted file mode 100644
index 4573cf5bc2a28..0000000000000
--- a/sql/core/src/main/java/org/apache/spark/sql/connector/read/LocalScan.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.connector.read;
-
-import org.apache.spark.annotation.Experimental;
-import org.apache.spark.sql.catalyst.InternalRow;
-
-/**
- * A special Scan which will happen on Driver locally instead of Executors.
- *
- * @since 3.2.0
- */
-@Experimental
-public interface LocalScan extends Scan {
-  InternalRow[] rows();
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
index 7bbd7b4b07645..811f41832d159 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
@@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.toPrettySQL
 import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, StagingTableCatalog, SupportsNamespaces, SupportsPartitionManagement, SupportsWrite, Table, TableCapability, TableCatalog, TableChange}
-import org.apache.spark.sql.connector.read.LocalScan
 import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream}
 import org.apache.spark.sql.connector.write.V1Write
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
@@ -105,11 +104,6 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat
         tableIdentifier = None)
       withProjectAndFilter(project, filters, dsScan, needsUnsafeConversion = false) :: Nil
 
-    case PhysicalOperation(project, filters,
-        DataSourceV2ScanRelation(_, scan: LocalScan, output)) =>
-      val localScanExec = LocalTableScanExec(output, scan.rows().toSeq)
-      withProjectAndFilter(project, filters, localScanExec, needsUnsafeConversion = false) :: Nil
-
     case PhysicalOperation(project, filters, relation: DataSourceV2ScanRelation) =>
       // projection and filters were already pushed down in the optimizer.
       // this uses PhysicalOperation to get the projection and ensure that if the batch scan does
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala
deleted file mode 100644
index db71eeb75eae0..0000000000000
--- a/sql/core/src/test/scala/org/apache/spark/sql/connector/LocalScanSuite.scala
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.connector
-
-import java.util
-
-import scala.collection.JavaConverters._
-
-import org.apache.spark.sql.{QueryTest, Row}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.connector.catalog.{BasicInMemoryTableCatalog, Identifier, SupportsRead, Table, TableCapability}
-import org.apache.spark.sql.connector.expressions.Transform
-import org.apache.spark.sql.connector.read.{LocalScan, Scan, ScanBuilder}
-import org.apache.spark.sql.execution.LocalTableScanExec
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.test.SharedSparkSession
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.util.CaseInsensitiveStringMap
-
-class LocalScanSuite extends QueryTest with SharedSparkSession {
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat")
-    spark.conf.set("spark.sql.catalog.testcat", classOf[TestLocalScanCatalog].getName)
-    sql("CREATE TABLE testcat.tbl(i int)")
-  }
-
-  override def afterAll(): Unit = {
-    spark.conf.unset(SQLConf.DEFAULT_CATALOG.key)
-    spark.conf.unset("spark.sql.catalog.testcat")
-    super.afterAll()
-  }
-
-  test("full scan") {
-    val df = spark.table("testcat.tbl")
-    assert(df.schema == TestLocalScanTable.schema)
-
-    val localScan = df.queryExecution.executedPlan.collect {
-      case s: LocalTableScanExec => s
-    }
-    assert(localScan.length == 1)
-    checkAnswer(df, TestLocalScanTable.data.map(Row(_)))
-  }
-}
-
-class TestLocalScanCatalog extends BasicInMemoryTableCatalog {
-  override def createTable(
-      ident: Identifier,
-      schema: StructType,
-      partitions: Array[Transform],
-      properties: util.Map[String, String]): Table = {
-    val table = new TestLocalScanTable(ident.toString)
-    tables.put(ident, table)
-    table
-  }
-}
-
-object TestLocalScanTable {
-  val schema = new StructType().add("i", "int")
-  val data = Seq(1, 2, 3)
-}
-
-class TestLocalScanTable(override val name: String) extends Table with SupportsRead {
-  override def schema(): StructType = TestLocalScanTable.schema
-
-  override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava
-
-  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
-    new TestLocalScanBuilder
-
-  private class TestLocalScanBuilder extends ScanBuilder {
-    override def build(): Scan = new TestLocalScan
-  }
-
-  private class TestLocalScan extends LocalScan {
-    override def rows(): Array[InternalRow] = TestLocalScanTable.data.map(InternalRow(_)).toArray
-
-    override def readSchema(): StructType = TestLocalScanTable.schema
-  }
-}
From 324fedb359fe05c97c75db641fabcb5ea957f0a0 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 18:54:21 -0700
Subject: [PATCH 26/30] Revert "[SPARK-35559][TEST] Speed up one test in AdaptiveQueryExecSuite"

This reverts commit 5fc105f70a50160954b593ac2c1da3010310255e.
---
 .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index b9515c038f0ec..454d3aa148a44 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1513,7 +1513,7 @@ class AdaptiveQueryExecSuite
   test("SPARK-34091: Batch shuffle fetch in AQE partition coalescing") {
     withSQLConf(
       SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
-      SQLConf.SHUFFLE_PARTITIONS.key -> "10",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "10000",
       SQLConf.FETCH_SHUFFLE_BLOCKS_IN_BATCH.key -> "true") {
       withTable("t1") {
         spark.range(100).selectExpr("id + 1 as a").write.format("parquet").saveAsTable("t1")
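Note on the reverted test above: AQE partition coalescing with batched shuffle-block fetch is driven by three session confs, which the test sets through `withSQLConf`. A minimal sketch of the same setup outside the test harness, assuming the standard conf keys behind the `SQLConf` entries named in the diff (the session builder, master, and app name here are illustrative, not part of the patch):
{{{
import org.apache.spark.sql.SparkSession

// Hedged sketch: the same three knobs the reverted test toggles.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("aqe-coalescing-sketch")
  .getOrCreate()
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "10000")
spark.conf.set("spark.sql.adaptive.fetchShuffleBlocksInBatch", "true")
}}}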
From d6e320c5ad1ea3d7f248525bf9e333875c0887f7 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 19:58:08 -0700
Subject: [PATCH 27/30] reduce # of predicate

---
 .../sql/execution/benchmark/BloomFilterBenchmark.scala | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
index d29c6253ab746..ff7e2d8bc880d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -45,7 +45,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private val N = scaleFactor * 1000 * 1000
 
   private val df1 = spark.range(N).map(_ => Random.nextInt)
-  private val df2 = Seq.fill(N) {UUID.randomUUID().toString.replace("-", "")}.toDF
+  private val df2 = Seq.fill(1000 * 1000) {UUID.randomUUID().toString.replace("-", "")}.toDF
 
   private def writeORCBenchmark(): Unit = {
     withTempPath { dir =>
@@ -88,7 +88,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
   private def readORCBenchmarkForInSet(): Unit = {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
-      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.orc(path + "/withoutBF")
@@ -96,7 +96,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
 
       runBenchmark(s"ORC Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.orc(path + "/withoutBF").where(filter).noop()
         }
@@ -161,7 +161,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
-      val samples = df2.sample(0.0000003, 128).select("value").as[String].collect()
+      val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
       val filter = "value IN (" + samples.map ( x => s"'$x'").mkString(", ") + ")"
 
       df2.repartition(col("value")).sort(col("value")).write.parquet(path + "/withoutBF")
@@ -171,7 +171,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
         .parquet(path + "/withBF")
 
       runBenchmark(s"Parquet Read for IN set") {
-        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output)
         benchmark.addCase("Without bloom filter") { _ =>
           spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50)
             .parquet(path + "/withoutBF").where(filter).noop()
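Note on the IN-set benchmarks touched above: the predicate is built by sampling a handful of values that are known to exist in the data (with a fixed seed, 1M rows at fraction 0.000003 yields about three values) and quoting them into a SQL IN list. A standalone sketch of that construction, assuming a local SparkSession (the master and app name are illustrative; the column name `value` is the default for a Dataset of strings, matching the benchmark):
{{{
import java.util.UUID

import org.apache.spark.sql.SparkSession

object InSetPredicateSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]").appName("in-set-sketch").getOrCreate()
    import spark.implicits._

    // One million random 32-character strings, mirroring df2 in the benchmark.
    val df2 = Seq.fill(1000 * 1000)(UUID.randomUUID().toString.replace("-", "")).toDF

    // Sample a few existing values (seeded for reproducibility), then quote
    // them into a SQL IN list exactly as the benchmark does.
    val samples = df2.sample(0.000003, 128).select("value").as[String].collect()
    val filter = "value IN (" + samples.map(x => s"'$x'").mkString(", ") + ")"
    println(filter) // e.g. value IN ('ab12...', 'cd34...', 'ef56...')

    spark.stop()
  }
}
}}}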
From d481ec1f0a874d1b42521111f9b8c37549cb0586 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 8 Jun 2021 20:59:22 -0700
Subject: [PATCH 28/30] update test results

---
 .../BloomFilterBenchmark-jdk11-results.txt | 128 +++++++++++-------
 .../BloomFilterBenchmark-results.txt       | 102 ++++++++------
 2 files changed, 139 insertions(+), 91 deletions(-)

diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
index 04dcc36033b3c..ad462da6db245 100644
--- a/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk11-results.txt
@@ -2,155 +2,179 @@ ORC Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 20066 20165 139 5.0 200.7 1.0X
-With bloom filter 22784 22937 217 4.4 227.8 0.9X
+Without bloom filter 13568 13645 109 7.4 135.7 1.0X
+With bloom filter 16116 16238 172 6.2 161.2 0.8X
 
 
 ================================================================================================
 ORC Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 1732 1777 64 57.7 17.3 1.0X
-With bloom filter 1287 1295 10 77.7 12.9 1.3X
+Without bloom filter 1572 1605 47 63.6 15.7 1.0X
+With bloom filter 1343 1359 23 74.5 13.4 1.2X
+
+
+================================================================================================
+ORC Read for IN set
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter 51 63 15 19.6 51.1 1.0X
+With bloom filter 54 88 23 18.5 54.0 0.9X
 
 
 ================================================================================================
 Parquet Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 18804 18870 93 5.3 188.0 1.0X
-With bloom filter 23439 23667 323 4.3 234.4 0.8X
+Without bloom filter 13679 13954 389 7.3 136.8 1.0X
+With bloom filter 18260 18284 33 5.5 182.6 0.7X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 2097152 1550 1733 259 64.5 15.5 1.0X
-With bloom filter, blocksize: 2097152 350 500 119 285.3 3.5 4.4X
+Without bloom filter, blocksize: 2097152 954 984 49 104.8 9.5 1.0X
+With bloom filter, blocksize: 2097152 285 307 21 350.4 2.9 3.3X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 3145728 1256 1339 117 79.6 12.6 1.0X
-With bloom filter, blocksize: 3145728 333 388 35 300.0 3.3 3.8X
+Without bloom filter, blocksize: 3145728 788 831 40 126.9 7.9 1.0X
+With bloom filter, blocksize: 3145728 192 262 47 521.4 1.9 4.1X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 4194304 1240 1259 28 80.7 12.4 1.0X
-With bloom filter, blocksize: 4194304 243 297 34 411.4 2.4 5.1X
+Without bloom filter, blocksize: 4194304 787 847 75 127.0 7.9 1.0X
+With bloom filter, blocksize: 4194304 201 224 18 496.4 2.0 3.9X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 5242880 1345 1380 49 74.3 13.5 1.0X
-With bloom filter, blocksize: 5242880 352 433 55 284.1 3.5 3.8X
+Without bloom filter, blocksize: 5242880 854 872 18 117.1 8.5 1.0X
+With bloom filter, blocksize: 5242880 172 222 37 582.7 1.7 5.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 6291456 1272 1310 54 78.6 12.7 1.0X
-With bloom filter, blocksize: 6291456 356 404 30 280.9 3.6 3.6X
+Without bloom filter, blocksize: 6291456 785 813 27 127.4 7.9 1.0X
+With bloom filter, blocksize: 6291456 167 188 14 598.0 1.7 4.7X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 8388608 1355 1410 78 73.8 13.5 1.0X
-With bloom filter, blocksize: 8388608 603 632 25 165.8 6.0 2.2X
+Without bloom filter, blocksize: 8388608 806 834 42 124.1 8.1 1.0X
+With bloom filter, blocksize: 8388608 360 383 29 277.8 3.6 2.2X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 16777216 1315 1404 126 76.0 13.2 1.0X
-With bloom filter, blocksize: 16777216 1161 1235 104 86.1 11.6 1.1X
+Without bloom filter, blocksize: 16777216 812 846 42 123.2 8.1 1.0X
+With bloom filter, blocksize: 16777216 780 807 27 128.2 7.8 1.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 33554432 1332 1394 89 75.1 13.3 1.0X
-With bloom filter, blocksize: 33554432 1283 1338 77 77.9 12.8 1.0X
+Without bloom filter, blocksize: 33554432 852 862 10 117.4 8.5 1.0X
+With bloom filter, blocksize: 33554432 820 865 59 121.9 8.2 1.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 67108864 1323 1325 3 75.6 13.2 1.0X
-With bloom filter, blocksize: 67108864 1320 1335 21 75.7 13.2 1.0X
+Without bloom filter, blocksize: 67108864 844 911 58 118.5 8.4 1.0X
+With bloom filter, blocksize: 67108864 851 853 2 117.5 8.5 1.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1046-azure
-Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 --------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 134217728 1188 1221 48 84.2 11.9 1.0X
-With bloom filter, blocksize: 134217728 1221 1248 39 81.9 12.2 1.0X
+Without bloom filter, blocksize: 134217728 839 887 53 119.3 8.4 1.0X
+With bloom filter, blocksize: 134217728 872 881 9 114.6 8.7 1.0X
+
+
+================================================================================================
+Parquet Read for IN set
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter 70 76 6 14.2 70.2 1.0X
+With bloom filter 73 103 22 13.8 72.6 1.0X
diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
index ba5c699aeaa99..5faf31841866c 100644
--- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt
+++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
@@ -2,155 +2,179 @@ ORC Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 19398 19534 192 5.2 194.0 1.0X
-With bloom filter 24950 24986 51 4.0 249.5 0.8X
+Without bloom filter 15800 15864 90 6.3 158.0 1.0X
+With bloom filter 18447 18451 6 5.4 184.5 0.9X
 
 
 ================================================================================================
 ORC Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 1554 1604 71 64.4 15.5 1.0X
-With bloom filter 1218 1240 32 82.1 12.2 1.3X
+Without bloom filter 1543 1562 26 64.8 15.4 1.0X
+With bloom filter 1145 1163 25 87.4 11.4 1.3X
+
+
+================================================================================================
+ORC Read for IN set
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter 57 71 16 17.6 56.8 1.0X
+With bloom filter 54 63 12 18.4 54.2 1.0X
 
 
 ================================================================================================
 Parquet Write
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter 17246 17265 27 5.8 172.5 1.0X
-With bloom filter 26679 26680 1 3.7 266.8 0.6X
+Without bloom filter 14024 14315 412 7.1 140.2 1.0X
+With bloom filter 22622 22681 84 4.4 226.2 0.6X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 2097152 956 975 18 104.7 9.6 1.0X
-With bloom filter, blocksize: 2097152 281 301 13 355.5 2.8 3.4X
+Without bloom filter, blocksize: 2097152 802 826 23 124.7 8.0 1.0X
+With bloom filter, blocksize: 2097152 241 257 12 414.7 2.4 3.3X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 3145728 844 858 13 118.5 8.4 1.0X
-With bloom filter, blocksize: 3145728 216 224 8 463.9 2.2 3.9X
+Without bloom filter, blocksize: 3145728 786 794 8 127.3 7.9 1.0X
+With bloom filter, blocksize: 3145728 191 203 11 523.1 1.9 4.1X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 4194304 845 860 14 118.3 8.4 1.0X
-With bloom filter, blocksize: 4194304 210 223 15 476.1 2.1 4.0X
+Without bloom filter, blocksize: 4194304 786 790 5 127.3 7.9 1.0X
+With bloom filter, blocksize: 4194304 179 191 10 559.5 1.8 4.4X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 5242880 837 845 8 119.5 8.4 1.0X
-With bloom filter, blocksize: 5242880 228 236 14 438.9 2.3 3.7X
+Without bloom filter, blocksize: 5242880 775 787 18 129.0 7.8 1.0X
+With bloom filter, blocksize: 5242880 171 181 13 584.9 1.7 4.5X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 6291456 846 853 12 118.3 8.5 1.0X
-With bloom filter, blocksize: 6291456 273 284 11 365.7 2.7 3.1X
+Without bloom filter, blocksize: 6291456 767 777 9 130.3 7.7 1.0X
+With bloom filter, blocksize: 6291456 220 233 12 454.3 2.2 3.5X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 ------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 8388608 843 852 8 118.7 8.4 1.0X
-With bloom filter, blocksize: 8388608 376 397 18 266.0 3.8 2.2X
+Without bloom filter, blocksize: 8388608 782 799 17 127.9 7.8 1.0X
+With bloom filter, blocksize: 8388608 344 356 17 291.1 3.4 2.3X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 16777216 881 886 4 113.4 8.8 1.0X
-With bloom filter, blocksize: 16777216 759 760 1 131.7 7.6 1.2X
+Without bloom filter, blocksize: 16777216 800 811 14 125.0 8.0 1.0X
+With bloom filter, blocksize: 16777216 628 646 12 159.2 6.3 1.3X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 33554432 861 870 8 116.1 8.6 1.0X
-With bloom filter, blocksize: 33554432 886 901 13 112.9 8.9 1.0X
+Without bloom filter, blocksize: 33554432 802 813 10 124.7 8.0 1.0X
+With bloom filter, blocksize: 33554432 817 833 15 122.4 8.2 1.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 -------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 67108864 866 872 8 115.4 8.7 1.0X
-With bloom filter, blocksize: 67108864 867 871 4 115.3 8.7 1.0X
+Without bloom filter, blocksize: 67108864 798 816 23 125.3 8.0 1.0X
+With bloom filter, blocksize: 67108864 810 823 12 123.4 8.1 1.0X
 
 
 ================================================================================================
 Parquet Read
 ================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1046-azure
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
 Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
 --------------------------------------------------------------------------------------------------------------------------
-Without bloom filter, blocksize: 134217728 866 890 20 115.4 8.7 1.0X
-With bloom filter, blocksize: 134217728 867 878 12 115.3 8.7 1.0X
+Without bloom filter, blocksize: 134217728 831 845 13 120.4 8.3 1.0X
+With bloom filter, blocksize: 134217728 799 821 20 125.1 8.0 1.0X
+
+
+================================================================================================
+Parquet Read for IN set
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.4.0-1047-azure
+Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz
+Read a row from 1M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without bloom filter 86 99 18 11.6 86.5 1.0X
+With bloom filter 85 95 14 11.8 84.9 1.0X
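A note on reading the Parquet tables above: Parquet bloom filters are stored and checked per row group, so one plausible reading of the results is that smaller values of parquet.block.size give the reader more row groups it can skip outright, which is why the speedup is largest around 2-8 MB and fades to roughly 1.0X at 16 MB and beyond. A sketch of how one might reproduce the block-size sweep outside the benchmark harness, assuming the parquet-hadoop option keys used elsewhere in this series (the output path, row count, and ndv hint below are illustrative):
{{{
import scala.util.Random

import org.apache.spark.sql.SparkSession

object BlockSizeSweepSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]").appName("blocksize-sweep").getOrCreate()
    import spark.implicits._

    val df = spark.range(10 * 1000 * 1000).map(_ => Random.nextInt).toDF("value")

    // One Parquet copy per row-group size; smaller row groups mean more
    // bloom-filter pruning opportunities on the read side.
    for (blocksize <- Seq(2 * 1024 * 1024, 16 * 1024 * 1024, 128 * 1024 * 1024)) {
      df.write
        .mode("overwrite")
        .option("parquet.block.size", blocksize)
        .option("parquet.bloom.filter.enabled#value", true)
        .option("parquet.bloom.filter.expected.ndv#value", "10000000")
        .parquet(s"/tmp/bf-sweep/blocksize-$blocksize") // hypothetical path
    }

    spark.stop()
  }
}
}}}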
Read") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => spark.read.orc(path + "/withoutBF").where("value = 0").noop() @@ -95,8 +95,8 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { df2.repartition(col("value")).sort(col("value")) .write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") - runBenchmark(s"ORC Read for IN set") { - val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output) + runBenchmark("ORC Read for IN set") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => spark.read.orc(path + "/withoutBF").where(filter).noop() } @@ -112,7 +112,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - runBenchmark(s"Parquet Write") { + runBenchmark("Parquet Write") { val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => df1.write.mode("overwrite").parquet(path + "/withoutBF") @@ -142,7 +142,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { .option("parquet.block.size", blocksize) .parquet(path + "/withBF") - runBenchmark(s"Parquet Read") { + runBenchmark("Parquet Read") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter, blocksize: " + blocksize) { _ => spark.read.parquet(path + "/withoutBF").where("value = 0").noop() @@ -170,8 +170,8 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { .option("parquet.bloom.filter.expected.ndv#value", "100000000") .parquet(path + "/withBF") - runBenchmark(s"Parquet Read for IN set") { - val benchmark = new Benchmark(s"Read a row from 1M rows", 1000 * 1000, output = output) + runBenchmark("Parquet Read for IN set") { + new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50) .parquet(path + "/withoutBF").where(filter).noop() From 82e1e8ed22b968e846c44fe6eb1c48a5cd219822 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 10 Jun 2021 22:30:10 -0700 Subject: [PATCH 30/30] fix error --- .../spark/sql/execution/benchmark/BloomFilterBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 7ea1c10c8b0f8..1beb766dd3d20 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -171,7 +171,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { .parquet(path + "/withBF") runBenchmark("Parquet Read for IN set") { - new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => spark.read.option("spark.sql.parquet.pushdown.inFilterThreshold", 50) .parquet(path + "/withoutBF").where(filter).noop()