-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-9067][SQL] Close reader in NewHadoopRDD early if there is no more data #7424
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
67569da
a305621
f429016
216912f
50ed729
e34d98e
3ceb755
5116cbe
e152182
3d20267
3ff64e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -128,7 +128,7 @@ class NewHadoopRDD[K, V]( | |
| configurable.setConf(conf) | ||
| case _ => | ||
| } | ||
| val reader = format.createRecordReader( | ||
| private var reader = format.createRecordReader( | ||
| split.serializableHadoopSplit.value, hadoopAttemptContext) | ||
| reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext) | ||
|
|
||
|
|
@@ -141,6 +141,10 @@ class NewHadoopRDD[K, V]( | |
| override def hasNext: Boolean = { | ||
| if (!finished && !havePair) { | ||
| finished = !reader.nextKeyValue | ||
| if (finished) { | ||
| // Close reader and release it | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you change this comment (and the one in SqlNewHadoopRdd) to say something like "Close and release the reader here; close() will also be called when the task completes, but for tasks that read from many files, it helps to release the resources early"? I'm just worried this could be removed later on if someone thinks it's redundant with the close() in task completion listener.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good suggestion. I added it. |
||
| close() | ||
| } | ||
| havePair = !finished | ||
| } | ||
| !finished | ||
|
|
@@ -159,18 +163,23 @@ class NewHadoopRDD[K, V]( | |
|
|
||
| private def close() { | ||
| try { | ||
| reader.close() | ||
| if (bytesReadCallback.isDefined) { | ||
| inputMetrics.updateBytesRead() | ||
| } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] || | ||
| split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) { | ||
| // If we can't get the bytes read from the FS stats, fall back to the split size, | ||
| // which may be inaccurate. | ||
| try { | ||
| inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength) | ||
| } catch { | ||
| case e: java.io.IOException => | ||
| logWarning("Unable to get input size to set InputMetrics for task", e) | ||
| if (reader != null) { | ||
| // Close reader and release it | ||
| reader.close() | ||
| reader = null | ||
|
|
||
| if (bytesReadCallback.isDefined) { | ||
| inputMetrics.updateBytesRead() | ||
| } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] || | ||
| split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) { | ||
| // If we can't get the bytes read from the FS stats, fall back to the split size, | ||
| // which may be inaccurate. | ||
| try { | ||
| inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength) | ||
| } catch { | ||
| case e: java.io.IOException => | ||
| logWarning("Unable to get input size to set InputMetrics for task", e) | ||
| } | ||
| } | ||
| } | ||
| } catch { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would it make sense if we just call close here?