Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ protected QueryRunner createQueryRunner()
// Below are required to enable caching on metastore
.put("hive.metastore-cache-ttl", "1d")
.put("hive.metastore-refresh-interval", "1d")
// This is required to reduce memory pressure to test writing large files
.put("hive.s3.streaming.part-size", "5MB")
.buildOrThrow());
}

Expand Down Expand Up @@ -244,6 +246,41 @@ public void testFlushPartitionCache()
computeActual(format("DROP TABLE %s", fullyQualifiedTestTableName));
}

@Test
public void testWriteLargeFiles()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you help me understand how a single test adds coverage for all 3 cases you mention in the commit message?
It would be worthy to add a comment in the code itself to explain what is happening. And also (if possible) an assertion which would validate that you tested exactly the case you wanted to.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed comments. This query writes three files whose sizes fall into the different ranges.

Added assertion for the file sizes, and a comment explanation.

{
String testTable = getTestTableName();
computeActual(format(
"CREATE TABLE %s (" +
" col1 varchar, " +
" col2 varchar, " +
" regionkey bigint) " +
" WITH (partitioned_by=ARRAY['regionkey'])",
testTable));
String largeRowExpression = "array_join(transform(sequence(1, 70), x-> array_join(repeat(comment, 1000), '')), '')";
computeActual(format("INSERT INTO " + testTable + " SELECT %s, %s, regionkey FROM tpch.tiny.nation WHERE regionkey = 1", largeRowExpression, largeRowExpression));

// Exercise different code paths of Hive S3 streaming upload, with upload part size 5MB:
// 1. fileSize <= 5MB (direct upload)
// 2. 5MB < fileSize <= 10MB (upload in two parts)
// 3. fileSize > 10MB (upload in three or more parts)
query(format("SELECT DISTINCT (\"$file_size\" - 1) / (5 * 1024 * 1024) FROM %s ORDER BY 1", testTable))
.assertThat()
.skippingTypesCheck()
.containsAll(resultBuilder(getSession())
.row(0L)
.row(1L)
.row(2L)
.build());
query(format("SELECT SUM(LENGTH(col1)) FROM %s", testTable))
.assertThat()
.skippingTypesCheck()
.containsAll(resultBuilder(getSession())
.row(500L * 70 * 1000)
.build());
computeActual(format("DROP TABLE %s", testTable));
}

private void renamePartitionResourcesOutsideTrino(String tableName, String partitionColumn, String regionKey)
{
String partitionName = format("%s=%s", partitionColumn, regionKey);
Expand Down