spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java (20 changes: 10 additions & 10 deletions)
@@ -30,14 +30,14 @@
 import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.math.LongMath;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.TypeUtil;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.AnalysisException;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalog.Column;
 import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;

@@ -280,23 +280,23 @@ private static PartitionSpec identitySpec(Schema schema, List<String> partitionN
   }

   /**
-   * estimate approximate table size based on spark schema and total records.
+   * Estimate approximate table size based on Spark schema and total records.
    *
-   * @param tableSchema spark schema
+   * @param tableSchema Spark schema
    * @param totalRecords total records in the table
-   * @return approxiate size based on table schema
+   * @return approximate size based on table schema
    */
   public static long estimateSize(StructType tableSchema, long totalRecords) {
     if (totalRecords == Long.MAX_VALUE) {
       return totalRecords;
     }

-    long approximateSize = 0;
-    for (StructField sparkField : tableSchema.fields()) {
-      approximateSize += sparkField.dataType().defaultSize();
+    long result;
+    try {
+      result = LongMath.checkedMultiply(tableSchema.defaultSize(), totalRecords);
+    } catch (ArithmeticException e) {
+      result = Long.MAX_VALUE;
Comment on lines -294 to +298 (contributor, PR author):

StructType has a defaultSize method; there is no need to re-implement it. The main utility of the estimateSize static method is checking for overflow; we should just use Guava's LongMath.checkedMultiply. (A short sketch of the overflow behavior follows this hunk.)

     }

-    long result = approximateSize * totalRecords;
-    return result > 0 ? result : Long.MAX_VALUE;
+    return result;
   }
 }
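The overflow point in the comment above is worth making concrete: two positive longs can wrap around to a small positive product, so the old `result > 0` guard can silently return a nonsense estimate. A minimal sketch, assuming plain Guava coordinates on the classpath (Iceberg itself uses the relocated com.google.common.math.LongMath); the values are hypothetical, chosen so the product overflows to a small positive number:

import com.google.common.math.LongMath;

public class OverflowDemo {
  public static void main(String[] args) {
    long rowSize = 1L << 33;      // hypothetical per-row size in bytes
    long totalRecords = 1L << 33; // hypothetical row count

    // 2^33 * 2^33 = 2^66, which wraps modulo 2^64 to 4: a positive value,
    // so a `result > 0` check cannot tell this apart from a real estimate.
    long wrapped = rowSize * totalRecords;
    System.out.println(wrapped); // prints 4

    // checkedMultiply throws on overflow instead of wrapping silently.
    try {
      System.out.println(LongMath.checkedMultiply(rowSize, totalRecords));
    } catch (ArithmeticException e) {
      System.out.println("overflow detected; fall back to Long.MAX_VALUE");
    }
  }
}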
@@ -207,22 +207,20 @@ public Statistics estimateStatistics() {
       LOG.debug("using table metadata to estimate table statistics");
       long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
           SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
-      Schema projectedSchema = expectedSchema != null ? expectedSchema : table.schema();
Comment from the contributor (PR author):

I don't think expectedSchema is ever null. Thus, I think we can just call readSchema() to get the StructType. (See the sketch after this hunk.)

       return new Stats(
-          SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(projectedSchema), totalRecords),
+          SparkSchemaUtil.estimateSize(readSchema(), totalRecords),
           totalRecords);
     }

-    long sizeInBytes = 0L;
     long numRows = 0L;

     for (CombinedScanTask task : tasks()) {
       for (FileScanTask file : task.files()) {
-        sizeInBytes += file.length();
         numRows += file.file().recordCount();
       }
     }

+    long sizeInBytes = SparkSchemaUtil.estimateSize(readSchema(), numRows);
     return new Stats(sizeInBytes, numRows);
   }
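On the second comment: in Spark's DataSourceV2 API, readSchema() exposes the projected read schema as a StructType. Below is a hedged sketch of the lazy-conversion pattern that comment relies on; the class and field names are illustrative, not Iceberg's actual reader code:

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.types.StructType;

// Illustrative only: convert the (assumed non-null) Iceberg read schema to a
// Spark StructType once, then reuse it wherever the projected schema is needed.
class ProjectedSchemaHolder {
  private final Schema expectedSchema; // assumed non-null, per the comment
  private StructType readSchema;

  ProjectedSchemaHolder(Schema expectedSchema) {
    this.expectedSchema = expectedSchema;
  }

  StructType readSchema() {
    if (readSchema == null) {
      readSchema = SparkSchemaUtil.convert(expectedSchema);
    }
    return readSchema;
  }
}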

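Finally, a quick usage sketch of the revised estimateSize, assuming the Iceberg and Spark jars are on the classpath. StructType.defaultSize() sums a fixed per-type default size for each field (for example 8 bytes for a long; exact values vary by type and Spark version), so the estimate is simply that row width times the record count, clamped to Long.MAX_VALUE on overflow:

import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class EstimateSizeDemo {
  public static void main(String[] args) {
    StructType schema = new StructType()
        .add("id", DataTypes.LongType)
        .add("name", DataTypes.StringType);

    // Ordinary case: defaultSize() * rows.
    System.out.println(SparkSchemaUtil.estimateSize(schema, 1_000L));

    // Overflow case: checkedMultiply throws internally and the
    // method returns Long.MAX_VALUE instead of a wrapped value.
    System.out.println(SparkSchemaUtil.estimateSize(schema, Long.MAX_VALUE - 1));
  }
}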