diff --git a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrer.scala b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrer.scala index f517b6033d94..7c571353c587 100644 --- a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrer.scala +++ b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrer.scala @@ -37,6 +37,14 @@ private object CosmosTableSchemaInferrer ResourceIdAttributeName, AttachmentsAttributeName) + private val notNullableProperties = List( + IdAttributeName, + ETagAttributeName, + SelfAttributeName, + ResourceIdAttributeName, + TimestampAttributeName, + AttachmentsAttributeName) + private[spark] def inferSchema( inferredItems: Seq[ObjectNode], includeSystemProperties: Boolean, @@ -125,7 +133,7 @@ private object CosmosTableSchemaInferrer case anyType: DataType => field.getKey -> StructField( field.getKey, anyType, - nullable= !systemProperties.contains(field.getKey) && allowNullForInferredProperties) + nullable= !notNullableProperties.contains(field.getKey) && allowNullForInferredProperties) }) .toSeq) } diff --git a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrerSpec.scala b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrerSpec.scala index 382f846df51f..484e2c42b85a 100644 --- a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrerSpec.scala +++ b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrerSpec.scala @@ -50,7 +50,7 @@ class CosmosTableSchemaInferrerSpec extends UnitSpec { schema.fields(1).name shouldBe "otherProperty" schema.fields(0).dataType shouldBe StringType schema.fields(1).dataType shouldBe StringType - schema.fields(0).nullable shouldBe true + schema.fields(0).nullable shouldBe false schema.fields(1).nullable shouldBe true } diff --git a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/SparkE2EQueryITest.scala b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/SparkE2EQueryITest.scala index bae610c7bf0e..a814e5e0e4f8 100644 --- a/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/SparkE2EQueryITest.scala +++ b/sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/SparkE2EQueryITest.scala @@ -295,6 +295,12 @@ class SparkE2EQueryITest fieldNames.contains(CosmosTableSchemaInferrer.ResourceIdAttributeName) shouldBe true fieldNames.contains(CosmosTableSchemaInferrer.ETagAttributeName) shouldBe true fieldNames.contains(CosmosTableSchemaInferrer.AttachmentsAttributeName) shouldBe true + + rowWithInference.schema(CosmosTableSchemaInferrer.SelfAttributeName).nullable shouldBe false + rowWithInference.schema(CosmosTableSchemaInferrer.TimestampAttributeName).nullable shouldBe false + rowWithInference.schema(CosmosTableSchemaInferrer.ResourceIdAttributeName).nullable shouldBe false + rowWithInference.schema(CosmosTableSchemaInferrer.ETagAttributeName).nullable shouldBe false + rowWithInference.schema(CosmosTableSchemaInferrer.AttachmentsAttributeName).nullable shouldBe false } "spark query" can "use schema inference with just timestamp" in { @@ -438,7 +444,7 @@ class SparkE2EQueryITest "spark.cosmos.database" -> cosmosDatabase, "spark.cosmos.container" -> cosmosContainer, "spark.cosmos.read.inferSchema.enabled" -> "true", - "spark.cosmos.read.inferSchema.query" -> "select TOP 1 c.type, c.age, c.isAlive, c._ts from c", + "spark.cosmos.read.inferSchema.query" -> "select TOP 1 c.type, c.age, c.isAlive, c._ts, c.id from c", "spark.cosmos.read.partitioning.strategy" -> "Restrictive" ) @@ -455,9 +461,16 @@ class SparkE2EQueryITest val fieldNames = rowWithInference.schema.fields.map(field => field.name) fieldNames.contains(CosmosTableSchemaInferrer.SelfAttributeName) shouldBe false fieldNames.contains(CosmosTableSchemaInferrer.TimestampAttributeName) shouldBe true + fieldNames.contains(CosmosTableSchemaInferrer.IdAttributeName) shouldBe true fieldNames.contains(CosmosTableSchemaInferrer.ResourceIdAttributeName) shouldBe false fieldNames.contains(CosmosTableSchemaInferrer.ETagAttributeName) shouldBe false fieldNames.contains(CosmosTableSchemaInferrer.AttachmentsAttributeName) shouldBe false + + rowWithInference.schema(CosmosTableSchemaInferrer.TimestampAttributeName).nullable shouldBe false + rowWithInference.schema(CosmosTableSchemaInferrer.IdAttributeName).nullable shouldBe false + rowWithInference.schema("type").nullable shouldBe true + rowWithInference.schema("age").nullable shouldBe true + rowWithInference.schema("isAlive").nullable shouldBe true } "spark query" can "when forceNullableProperties is false and rows have different schema" in {