@@ -50,6 +50,7 @@ import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext}
 
 import java.io.Closeable
 import java.util.Properties
+import scala.annotation.tailrec
 import scala.collection.JavaConverters._
 import scala.util.Try
 
@@ -188,17 +189,23 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
     protected def removeLogRecord(key: String): Option[HoodieRecord[_ <: HoodieRecordPayload[_]]] =
       logRecords.remove(key)
 
-    override def hasNext: Boolean =
+    override def hasNext: Boolean = hasNextInternal
+
+    // NOTE: It's crucial for this method to be annotated w/ [[@tailrec]] to make sure
+    //       that recursion is unfolded into a loop to avoid stack overflows while
+    //       handling records
+    @tailrec private def hasNextInternal: Boolean = {
       logRecordsIterator.hasNext && {
         val avroRecordOpt = logRecordsIterator.next()
         if (avroRecordOpt.isEmpty) {
           // Record has been deleted, skipping
-          this.hasNext
+          this.hasNextInternal
         } else {
           recordToLoad = unsafeProjection(deserialize(avroRecordOpt.get))
           true
         }
       }
+    }
 
     override final def next(): InternalRow = recordToLoad
 
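The same refactoring is applied to the record-merging iterator in the next hunk. To see why the change works, here is a minimal, self-contained sketch of the pattern (all names in it, such as SkippingIterator, are illustrative and not part of Hudi's API): an overridable hasNext can never be tail-call optimized by scalac, so the recursion is moved into a private helper, where @tailrec both enables the optimization and makes the compiler verify it.

import scala.annotation.tailrec

// Hypothetical stand-in for Hudi's log-record iterator: the underlying
// iterator yields None for deleted records, which hasNext must skip over.
class SkippingIterator(underlying: Iterator[Option[String]]) extends Iterator[String] {
  private var recordToLoad: String = _

  override def hasNext: Boolean = hasNextInternal

  // The self-call sits in tail position (the right-hand operand of &&, which
  // scalac rewrites into a conditional), so @tailrec compiles it into a loop.
  @tailrec
  private def hasNextInternal: Boolean =
    underlying.hasNext && {
      underlying.next() match {
        case None =>
          // Record has been deleted, keep scanning
          hasNextInternal
        case Some(value) =>
          recordToLoad = value
          true
      }
    }

  override def next(): String = recordToLoad
}

Note that annotating the original hasNext directly would not compile: @tailrec requires a method that cannot be overridden (private or final), which is exactly why the change delegates to hasNextInternal.
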
@@ -257,7 +264,12 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
 
     private val recordKeyOrdinal = baseFileReaderSchema.structTypeSchema.fieldIndex(tableState.recordKeyField)
 
-    override def hasNext: Boolean = {
+    override def hasNext: Boolean = hasNextInternal
+
+    // NOTE: It's crucial for this method to be annotated w/ [[@tailrec]] to make sure
+    //       that recursion is unfolded into a loop to avoid stack overflows while
+    //       handling records
+    @tailrec private def hasNextInternal: Boolean = {
       if (baseFileIterator.hasNext) {
         val curRowRecord = baseFileIterator.next()
         val curKey = curRowRecord.getString(recordKeyOrdinal)
@@ -270,7 +282,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
         val mergedAvroRecordOpt = merge(serialize(curRowRecord), updatedRecordOpt.get)
         if (mergedAvroRecordOpt.isEmpty) {
           // Record has been deleted, skipping
-          this.hasNext
+          this.hasNextInternal
         } else {
           // NOTE: In occurrence of a merge we can't know the schema of the record being returned, b/c
           //       record from the Delta Log will bear (full) Table schema, while record from the Base file
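
For context on the failure mode being fixed: a long run of consecutive deleted records previously added one stack frame per skipped record, eventually overflowing the stack. A hypothetical repro against the illustrative SkippingIterator sketched above (not code from this PR):

// Assumed driver code: one million deleted entries followed by a single
// live record.
val manyDeletes = Iterator.fill(1000000)(Option.empty[String]) ++ Iterator(Some("live"))
val it = new SkippingIterator(manyDeletes)
assert(it.hasNext)           // scans past 1,000,000 deletes in constant stack space
assert(it.next() == "live")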