Description
Hello Team,
Tables created using Databricks shallow clone cannot be read with Trino. The metadata JSON in this case contains absolute paths to the parquet files instead of the default relative ones.
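For context, the cloned table (test_table_3, shown further below) was presumably created from test_table_2 with a Databricks statement along these lines; the exact statement isn't captured in the logs, and the names here are taken from the CLONE commitInfo below:
CREATE TABLE dev_7.test_table_3 SHALLOW CLONE dev_7.test_table_2;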
Normal default table created using DBR:
trino> show create table delta_prod.dev_7.test_table_2;
Create Table
-----------------------------------------------------------------------------------------------------------------
CREATE TABLE delta_prod.dev_7.test_table_2 (
id bigint,
name varchar
)
WITH (
location = 'abfs://[email protected]/prod-data/dev_7.db/test_table_2',
partitioned_by = ARRAY[]
)
JSON metadata (note the relative add.path):
{"commitInfo":{"timestamp":1680790739007,"userId":"6912325007827228","userName":"abc@acom","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"349490711203691"},"clusterId":"1021-063628-phfe59zs","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"828"},"engineInfo":"Databricks-Runtime/12.2.x-scala2.12","txnId":"57bbb927-0cb9-4072-9353-6cc54e91ce9c"}}
{"add":{"path":"part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet","partitionValues":{},"size":828,"modificationTime":1680790738000,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"id\":1,\"name\":\"1\"},\"maxValues\":{\"id\":3,\"name\":\"3\"},\"nullCount\":{\"id\":0,\"name\":0}}","tags":{"INSERTION_TIME":"1680790738000000","MIN_INSERTION_TIME":"1680790738000000","MAX_INSERTION_TIME":"1680790738000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
Shallow cloned table:
trino> show create table delta_prod.dev_7.test_table_3;
Create Table
-----------------------------------------------------------------------------------------------------------------
CREATE TABLE delta_prod.dev_7.test_table_3 (
id bigint,
name varchar
)
WITH (
location = 'abfs://[email protected]/prod-data/dev_7.db/test_table_3',
partitioned_by = ARRAY[]
)
Shallow cloned table's JSON metadata (note the absolute abfs:// add.path pointing at the source table test_table_2):
{"commitInfo":{"timestamp":1680790786150,"userId":"6912325007827228","userName":"[email protected]","operation":"CLONE","operationParameters":{"source":"spark_catalog.dev_7.test_table_2","sourceVersion":1,"isShallow":true},"notebook":{"notebookId":"349490711203691"},"clusterId":"1021-063628-phfe59zs","readVersion":-1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"removedFilesSize":"0","numRemovedFiles":"0","sourceTableSize":"828","numCopiedFiles":"0","copiedFilesSize":"0","sourceNumOfFiles":"1"},"engineInfo":"Databricks-Runtime/12.2.x-scala2.12","txnId":"9da1ef62-53f2-4570-9393-4d236fde2de2"}}
{"metaData":{"id":"4a33a213-ffbf-4204-9136-7b829f2d652d","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1680790727656}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"add":{"path":"abfs://[email protected]/prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet","partitionValues":{},"size":828,"modificationTime":1680790738000,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"id\":1,\"name\":\"1\"},\"maxValues\":{\"id\":3,\"name\":\"3\"},\"nullCount\":{\"id\":0,\"name\":0}}","tags":{"INSERTION_TIME":"1680790738000000","MIN_INSERTION_TIME":"1680790738000000","MAX_INSERTION_TIME":"1680790738000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
This is the error faced when the cloned table is read using Trino v412; note how the location of test_table_3 is prefixed onto the already-absolute cloned file path, producing a doubled path that does not exist:
trino> select * from delta_prod.dev_7.test_table_3;
Query 20230412_091513_00000_ep2tb, FAILED, 1 node
Splits: 1 total, 0 done (0.00%)
CPU Time: 0.0s total, 0 rows/s, 0B/s, 23% active
Per Node: 0.0 parallelism, 0 rows/s, 0B/s
Parallelism: 0.0
Peak Memory: 88B
0.64 [0 rows, 0B] [0 rows/s, 0B/s]
Query 20230412_091513_00000_ep2tb failed: Error opening Hive split abfs://[email protected]/prod-data/dev_7.db/test_table_3//prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet (offset=0, length=828): HEAD https://account.dfs.core.windows.net/container/prod-data/dev_7.db/test_table_3/prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet?timeout=90
StatusCode=404
StatusDescription=The specified path does not exist.
ErrorCode=
ErrorMessage=
io.trino.spi.TrinoException: Error opening Hive split abfs://[email protected]/prod-data/dev_7.db/test_table_3//prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet (offset=0, length=828): HEAD https://account.dfs.core.windows.net/container/prod-data/dev_7.db/test_table_3/prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet?timeout=90
StatusCode=404
StatusDescription=The specified path does not exist.
ErrorCode=
ErrorMessage=
at io.trino.plugin.hive.parquet.ParquetPageSourceFactory.createPageSource(ParquetPageSourceFactory.java:312)
at io.trino.plugin.deltalake.DeltaLakePageSourceProvider.createPageSource(DeltaLakePageSourceProvider.java:203)
at io.trino.plugin.base.classloader.ClassLoaderSafeConnectorPageSourceProvider.createPageSource(ClassLoaderSafeConnectorPageSourceProvider.java:49)
at io.trino.split.PageSourceManager.createPageSource(PageSourceManager.java:62)
at io.trino.operator.TableScanOperator.getOutput(TableScanOperator.java:298)
at io.trino.operator.Driver.processInternal(Driver.java:402)
at io.trino.operator.Driver.lambda$process$8(Driver.java:305)
at io.trino.operator.Driver.tryWithLock(Driver.java:701)
at io.trino.operator.Driver.process(Driver.java:297)
at io.trino.operator.Driver.processForDuration(Driver.java:268)
at io.trino.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:845)
at io.trino.execution.executor.PrioritizedSplitRunner.process(PrioritizedSplitRunner.java:165)
at io.trino.execution.executor.TaskExecutor$TaskRunner.run(TaskExecutor.java:537)
at io.trino.$gen.Trino_412____20230406_092526_2.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.FileNotFoundException: HEAD https://account.dfs.core.windows.net/container/prod-data/dev_7.db/test_table_3/prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet?timeout=90
StatusCode=404
StatusDescription=The specified path does not exist.
ErrorCode=
ErrorMessage=
at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.checkException(AzureBlobFileSystem.java:926)
at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.open(AzureBlobFileSystem.java:177)
at io.trino.hdfs.TrinoFileSystemCache$FileSystemWrapper.open(TrinoFileSystemCache.java:393)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:906)
at io.trino.filesystem.hdfs.HdfsInputFile.lambda$openFile$1(HdfsInputFile.java:108)
at io.trino.hdfs.authentication.NoHdfsAuthentication.doAs(NoHdfsAuthentication.java:25)
at io.trino.hdfs.HdfsEnvironment.doAs(HdfsEnvironment.java:93)
at io.trino.filesystem.hdfs.HdfsInputFile.openFile(HdfsInputFile.java:108)
at io.trino.filesystem.hdfs.HdfsInputFile.newInput(HdfsInputFile.java:57)
at io.trino.plugin.hive.parquet.TrinoParquetDataSource.<init>(TrinoParquetDataSource.java:39)
at io.trino.plugin.hive.parquet.ParquetPageSourceFactory.createPageSource(ParquetPageSourceFactory.java:227)
... 16 more
Caused by: org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException: HEAD https://account.dfs.core.windows.net/container/prod-data/dev_7.db/test_table_3/prod-data/dev_7.db/test_table_2/part-00000-096fc898-c3f3-4aba-a95e-c7c04beefa9e-c000.snappy.parquet?timeout=90
StatusCode=404
StatusDescription=The specified path does not exist.
ErrorCode=
ErrorMessage=
at org.apache.hadoop.fs.azurebfs.services.AbfsRestOperation.execute(AbfsRestOperation.java:134)
at org.apache.hadoop.fs.azurebfs.services.AbfsClient.getPathProperties(AbfsClient.java:352)
at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.openFileForRead(AzureBlobFileSystemStore.java:349)
at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.open(AzureBlobFileSystem.java:174)
... 25 more