core/src/test/scala/org/apache/spark/util/UtilsSuite.scala (6 additions, 0 deletions)

@@ -1308,6 +1308,12 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")

// edge case: non-path chars, including '[' and ", ", count toward the limit
// 1. second path is not added due to the addition of '['
assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
// 2. third path is not added due to the addition of ", "
assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
}

test("checkHost supports both IPV4 and IPV6") {
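
For context, the assertions above are consistent with a length check applied before each append, where the opening '[' and each ", " separator count toward the threshold. A minimal sketch of that behavior (an illustrative stand-in consistent with the assertions, not necessarily Spark's actual implementation; buildLocationMetadataSketch is a hypothetical name):

def buildLocationMetadataSketch(paths: Seq[String], stopAppendingThreshold: Int): String = {
  val metadata = new StringBuilder("[")
  var i = 0
  // keep appending while the accumulated length (including '[' and ", ")
  // is still below the threshold; the append that crosses it is the last one kept
  while (i < paths.length && metadata.length < stopAppendingThreshold) {
    if (i > 0) metadata.append(", ")
    metadata.append(paths(i))
    i += 1
  }
  metadata.append("]")
  metadata.toString
}

// e.g. buildLocationMetadataSketch(Seq("path0", "path1", "path2"), 6) == "[path0]"
// and threshold 13 yields "[path0, path1]", matching the edge cases above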
sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala

@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
import java.io.File

import scala.collection.mutable
import scala.util.Random

import org.apache.hadoop.fs.Path

@@ -122,24 +123,45 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
withTempPath { path =>
val dir = path.getCanonicalPath

// create a sub-directory with a long name so that each root path always exceeds the limit

Contributor Author: Let's just ensure we test on path truncation; other cases should be tested against UtilsSuite.

// this ensures we always test the path truncation case
// 110 = limit 100 + margin 10 to clearly avoid the edge case
val dataDir = if (dir.length >= 110) {

Member: nit comment; for test simplicity, how about always creating a new dir with a fixed-length long name (e.g., 110?) instead of checking the length (if (dir.length >= 110))? For example:

        val tooLongDirName = Random.alphanumeric.take(110).toList.mkString
        val f = new File(path, tooLongDirName)
        f.mkdir()

Contributor Author: I'm not 100% sure about possible glitches with very long paths (like exceeding 255 chars), but I agree it would be simpler if the path length poses no problem. Do you have any idea on this?

Member: Ah I see. If so, the current one looks fine. Could you leave some comments there about the limit? I didn't know it and it seems the max name length is 255 and the max path length is 4096? (I googled it)

Contributor Author (@HeartSaVioR, Feb 3, 2021): I don't know about the actual problem; probably just worrying too much (that's why I said "not 100% sure"). Let's simplify as you suggested and revisit if we trigger any glitch.

path
} else {
val dataDirName = Random.alphanumeric.take(110 - dir.length).toList.mkString
val f = new File(path, dataDirName)
f.mkdir()
f
}

val partitionCol = "partitionCol"
spark.range(10)
.select("id", "id")
.toDF("value", partitionCol)
.write
.partitionBy(partitionCol)
-      .orc(dir)
-    val paths = (0 to 9).map(i => new File(dir, s"$partitionCol=$i").getCanonicalPath)
+      .orc(dataDir.getCanonicalPath)
+    val paths = (0 to 9).map(i => new File(dataDir, s"$partitionCol=$i").getCanonicalPath)
val plan = spark.read.orc(paths: _*).queryExecution.executedPlan
val location = plan collectFirst {
case f: FileSourceScanExec => f.metadata("Location")
}
assert(location.isDefined)
// The location metadata should contain at least one path
assert(location.get.contains(paths.head))
// If the temp path is longer than 100 chars, the metadata length should not exceed
// twice that length; otherwise, the metadata length should stay within 200.
assert(location.get.length < Math.max(paths.head.length, 100) * 2)

// The location metadata should wrap the paths in brackets
assert(location.get.indexOf('[') > -1)
assert(location.get.indexOf(']') > -1)

// extract paths in location metadata (removing classname, brackets, separators)
val pathsInLocation = location.get.substring(
location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq

// only one path should be present
assert(pathsInLocation.size == 1)
}
}
}
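
As a usage note, a rough sketch of how the truncated Location metadata could be inspected interactively (the ORC path and the `spark` session are assumptions; the exact printed format is illustrative):

import org.apache.spark.sql.execution.FileSourceScanExec

// assumes an active SparkSession named `spark` and ORC data at a hypothetical path
val plan = spark.read.orc("/tmp/example/orc").queryExecution.executedPlan
val location = plan.collectFirst {
  case f: FileSourceScanExec => f.metadata("Location")
}
// expected to print Some(...[/tmp/example/orc]), with the file index class name before the bracket
println(location)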