-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
126 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
18 changes: 18 additions & 0 deletions
18
.../datalayout/src/main/java/com/linkedin/openhouse/datalayout/datasource/PartitionStat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package com.linkedin.openhouse.datalayout.datasource; | ||
|
||
import java.io.Serializable; | ||
import java.util.List; | ||
import lombok.AllArgsConstructor; | ||
import lombok.Builder; | ||
import lombok.Data; | ||
import lombok.NoArgsConstructor; | ||
|
||
/**
 * Represents per-partition statistics for a table.
 *
 * <p>Instances are produced by {@code TablePartitionStats} from the table's {@code .partitions}
 * metadata and deserialized by Spark via {@code Encoders.bean}, which requires the no-arg
 * constructor and getters/setters generated by Lombok's {@code @Data}/{@code @NoArgsConstructor}.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PartitionStat implements Serializable {
  // Partition column values rendered as strings, in partition-spec order.
  // Empty for a non-partitioned table.
  private List<String> values;
  // Number of data files belonging to this partition.
  private int fileCount;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
...ayout/src/main/java/com/linkedin/openhouse/datalayout/datasource/TablePartitionStats.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package com.linkedin.openhouse.datalayout.datasource; | ||
|
||
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import lombok.Builder;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
|
||
/** Data source implementation for table partition statistics. */ | ||
@Builder | ||
public class TablePartitionStats implements DataSource<PartitionStat> { | ||
private final SparkSession spark; | ||
private final String tableName; | ||
|
||
/** | ||
* Get partition statistics dataset for the table. | ||
* | ||
* @return Dataset of partition statistics sorted by partition values. | ||
*/ | ||
@Override | ||
public Dataset<PartitionStat> get() { | ||
StructType partitionSchema = | ||
spark.sql(String.format("SELECT * FROM %s.partitions", tableName)).schema(); | ||
try { | ||
partitionSchema.apply("partition"); | ||
return spark | ||
.sql( | ||
String.format( | ||
"SELECT partition, file_count FROM %s.partitions ORDER BY partition", tableName)) | ||
.map(new TablePartitionStats.PartitionStatMapper(), Encoders.bean(PartitionStat.class)); | ||
} catch (IllegalArgumentException e) { | ||
return spark | ||
.sql(String.format("SELECT null, file_count FROM %s.partitions", tableName)) | ||
.map(new TablePartitionStats.PartitionStatMapper(), Encoders.bean(PartitionStat.class)); | ||
} | ||
} | ||
|
||
static class PartitionStatMapper implements MapFunction<Row, PartitionStat> { | ||
@Override | ||
public PartitionStat call(Row row) { | ||
List<String> values = new ArrayList<>(); | ||
Row partition = row.getStruct(0); | ||
if (partition != null) { | ||
for (int i = 0; i < partition.size(); i++) { | ||
values.add(partition.get(i).toString()); | ||
} | ||
} | ||
return PartitionStat.builder().values(values).fileCount(row.getInt(1)).build(); | ||
} | ||
} | ||
} |
50 changes: 50 additions & 0 deletions
50
...t/src/test/java/com/linkedin/openhouse/datalayout/datasource/TablePartitionStatsTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package com.linkedin.openhouse.datalayout.datasource; | ||
|
||
import com.linkedin.openhouse.tablestest.OpenHouseSparkITest; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import org.apache.spark.sql.SparkSession; | ||
import org.junit.jupiter.api.Assertions; | ||
import org.junit.jupiter.api.Test; | ||
|
||
public class TablePartitionStatsTest extends OpenHouseSparkITest { | ||
@Test | ||
public void testPartitionedTablePartitionStats() throws Exception { | ||
final String testTable = "db.test_table_partition_stats_partitioned"; | ||
try (SparkSession spark = getSparkSession()) { | ||
spark.sql("USE openhouse"); | ||
spark.sql( | ||
String.format( | ||
"CREATE TABLE %s (id INT, data STRING, dt STRING) PARTITIONED BY (dt, id)", | ||
testTable)); | ||
spark.sql(String.format("INSERT INTO %s VALUES (0, '0', '2024-01-01')", testTable)); | ||
spark.sql(String.format("INSERT INTO %s VALUES (1, '1', '2024-01-02')", testTable)); | ||
spark.sql(String.format("INSERT INTO %s VALUES (1, '2', '2024-01-02')", testTable)); | ||
TablePartitionStats tablePartitionStats = | ||
TablePartitionStats.builder().spark(spark).tableName(testTable).build(); | ||
List<PartitionStat> stats = tablePartitionStats.get().collectAsList(); | ||
Assertions.assertEquals(2, stats.size()); | ||
Assertions.assertEquals(Arrays.asList("2024-01-01", "0"), stats.get(0).getValues()); | ||
Assertions.assertEquals(1, stats.get(0).getFileCount()); | ||
Assertions.assertEquals(Arrays.asList("2024-01-02", "1"), stats.get(1).getValues()); | ||
Assertions.assertEquals(2, stats.get(1).getFileCount()); | ||
} | ||
} | ||
|
||
@Test | ||
public void testNonPartitionedTablePartitionStats() throws Exception { | ||
final String testTable = "db.test_table_partition_stats_non_partitioned"; | ||
try (SparkSession spark = getSparkSession()) { | ||
spark.sql("USE openhouse"); | ||
spark.sql(String.format("CREATE TABLE %s (id INT, data STRING)", testTable)); | ||
spark.sql(String.format("INSERT INTO %s VALUES (0, '0')", testTable)); | ||
spark.sql(String.format("INSERT INTO %s VALUES (1, '1')", testTable)); | ||
TablePartitionStats tablePartitionStats = | ||
TablePartitionStats.builder().spark(spark).tableName(testTable).build(); | ||
List<PartitionStat> stats = tablePartitionStats.get().collectAsList(); | ||
Assertions.assertEquals(1, stats.size()); | ||
Assertions.assertTrue(stats.get(0).getValues().isEmpty()); | ||
Assertions.assertEquals(2, stats.get(0).getFileCount()); | ||
} | ||
} | ||
} |