
feat(5851): ArrowWriter memory usage #5967

Merged: 15 commits, Jul 2, 2024

Changes from 4 commits
21 changes: 21 additions & 0 deletions parquet/src/arrow/arrow_writer/byte_array.rs
@@ -210,6 +210,10 @@ impl FallbackEncoder {
}
}

/// Returns an estimate of the data page size in bytes
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
fn estimated_data_page_size(&self) -> usize {
match &self.encoder {
FallbackEncoderImpl::Plain { buffer, .. } => buffer.len(),
@@ -334,6 +338,10 @@ impl DictEncoder {
num_required_bits(length.saturating_sub(1) as u64)
}

fn estimated_memory_size(&self) -> usize {
self.interner.storage().page.len() + self.indices.len() * 8
Contributor:
I think we also need to account for the interner's dedup hash table. I added some code to do this in XX

Also, in general a more accurate estimate of memory usage is capacity() * element_size

}

fn estimated_data_page_size(&self) -> usize {
let bit_width = self.bit_width();
1 + RleEncoder::max_buffer_size(bit_width, self.indices.len())
@@ -443,10 +451,23 @@ impl ColumnValueEncoder for ByteArrayEncoder {
self.dict_encoder.is_some()
}

fn estimated_memory_size(&self) -> usize {
match &self.dict_encoder {
Some(encoder) => encoder.estimated_memory_size(),
// For the FallbackEncoder, these unflushed bytes are already encoded.
// Therefore, the size should be the same as estimated_data_page_size.
None => self.fallback.estimated_data_page_size(),
}
}

fn estimated_dict_page_size(&self) -> Option<usize> {
Some(self.dict_encoder.as_ref()?.estimated_dict_page_size())
}

/// Returns an estimate of the data page size in bytes
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
fn estimated_data_page_size(&self) -> usize {
match &self.dict_encoder {
Some(encoder) => encoder.estimated_data_page_size(),
45 changes: 42 additions & 3 deletions parquet/src/arrow/arrow_writer/mod.rs
@@ -203,7 +203,23 @@ impl<W: Write + Send> ArrowWriter<W> {
self.writer.flushed_row_groups()
}

/// Returns the estimated length in bytes of the current in progress row group
/// Returns the estimated memory usage of the current in progress row group.
///
/// This includes:
/// <already_written_encoded_byte_size> + <current_memory_size_of_unflushed_bytes> + <bytes_associated_with_processing>
///
/// In the vast majority of cases our unflushed bytes are already encoded.
pub fn memory_size(&self) -> usize {
match &self.in_progress {
Some(in_progress) => in_progress.writers.iter().map(|x| x.memory_size()).sum(),
None => 0,
Contributor, commenting on lines +216 to +218:
It took me a while to understand that this calculation actually also includes the in_progress_size too

What do you think about making this more explicit like

Suggested change:

    // before
    match &self.in_progress {
        Some(in_progress) => in_progress.writers.iter().map(|x| x.memory_size()).sum(),
        None => 0,

    // after
    match &self.in_progress {
        Some(in_progress) => in_progress.writers.iter().map(|x| x.memory_size() + x.get_estimated_total_bytes()).sum(),
        None => 0,
And then change code like

    /// Returns the estimated total memory usage.
    ///
    /// Unlike [`Self::get_estimated_total_bytes`] this is an estimate
    /// of the current memory usage and not its anticipated encoded size.
    #[cfg(feature = "arrow")]
    pub(crate) fn memory_size(&self) -> usize {
        self.column_metrics.total_bytes_written as usize + self.encoder.estimated_memory_size()
    }

to only include the estimated memory size:

    /// Returns the estimated total memory buffer usage.
    #[cfg(feature = "arrow")]
    pub(crate) fn memory_size(&self) -> usize {
        self.encoder.estimated_memory_size()
    }

wiedld (Contributor, Author), Jun 28, 2024:
The in_progress_size includes the flushed_bytes + unflushed_bytes.
The memory_size includes flushed_bytes + unflushed_bytes + processing_size.

At first glance, it looks like we could do memory_size = in_progress_size + processing_size. But what the calculation actually is:
in_progress_size = flushed_bytes + unflushed_bytes_anticipated_size_after_flush
memory_size = flushed_bytes + unflushed_bytes_current_mem_size + processing_size

Per our approach to memory limiting, unflushed bytes (in buffer) should already be encoded. However, I believe that's not the case for the indices on the dict encoder? As such, the accounting in this PR considers unflushed_bytes separately -- and pushes the new memory_size() interface down to where we can delineate the DictEncoder vs other encoders.

I added a commit to help explain. Please let me know if I misunderstood. 🙏🏼

alamb (Contributor), Jul 1, 2024:
I am thinking from a user's point of view (e.g. our use case in InfluxDB).

If I want to know how much memory the ArrowWriter is using (so I can track it against a budget, for example), what API should I use?

I was worried that I couldn't use:

  1. ArrowWriter::memory_size, because that does not include the estimated data page sizes, AND I couldn't use
  2. ArrowWriter::memory_size() + ArrowWriter::in_progress_size(), because that would double count previously buffered data.

However, after reviewing the code more closely I see the difference is that ArrowWriter::in_progress_size includes an estimate of how large the parquet encoded data would be after flush (which is not actually memory currently used), which presumably in most cases will be smaller than the actual bytes used. I will try to update the comments as well to clarify this.

}
}
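The two formulas from the review thread above can be checked with a toy calculation (std-only sketch; the function names and byte counts are illustrative, not the parquet crate's API):

```rust
/// flushed bytes + anticipated encoded size of unflushed data
fn in_progress_size(flushed: usize, unflushed_encoded_estimate: usize) -> usize {
    flushed + unflushed_encoded_estimate
}

/// flushed bytes + current in-memory size of unflushed data + processing scratch
fn memory_size(flushed: usize, unflushed_current: usize, processing: usize) -> usize {
    flushed + unflushed_current + processing
}

fn main() {
    let (flushed, unflushed_current, unflushed_encoded, processing) =
        (10_000usize, 4_096, 1_024, 2_048);
    let ips = in_progress_size(flushed, unflushed_encoded);
    let ms = memory_size(flushed, unflushed_current, processing);
    // Adding the two together double counts `flushed`, which is why
    // the review cautions against summing them for a budget check.
    assert_eq!(
        ips + ms,
        2 * flushed + unflushed_encoded + unflushed_current + processing
    );
}
```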

/// Returns the estimated length in encoded bytes of the current in progress row group.
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
pub fn in_progress_size(&self) -> usize {
match &self.in_progress {
Some(in_progress) => in_progress
@@ -629,7 +645,26 @@ impl ArrowColumnWriter {
Ok(ArrowColumnChunk { data, close })
}

/// Returns the estimated total bytes for this column writer
/// Returns the estimated total memory usage.
///
/// Unlike [`Self::get_estimated_total_bytes`] this is an estimate
/// of the current memory usage and not its anticipated encoded size.
///
/// This includes:
/// <already_written_encoded_byte_size> + <current_memory_size_of_unflushed_bytes> + <bytes_associated_with_processing>
///
/// In the vast majority of cases our unflushed bytes are already encoded.
pub fn memory_size(&self) -> usize {
match &self.writer {
ArrowColumnWriterImpl::ByteArray(c) => c.memory_size(),
ArrowColumnWriterImpl::Column(c) => c.memory_size(),
}
}

/// Returns the estimated total encoded bytes for this column writer.
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
pub fn get_estimated_total_bytes(&self) -> usize {
match &self.writer {
ArrowColumnWriterImpl::ByteArray(c) => c.get_estimated_total_bytes() as _,
@@ -2894,24 +2929,28 @@ mod tests {
// starts empty
assert_eq!(writer.in_progress_size(), 0);
assert_eq!(writer.in_progress_rows(), 0);
assert_eq!(writer.memory_size(), 0);
assert_eq!(writer.bytes_written(), 4); // Initial header
writer.write(&batch).unwrap();

// updated on write
let initial_size = writer.in_progress_size();
assert!(initial_size > 0);
assert_eq!(writer.in_progress_rows(), 5);
let initial_memory = writer.memory_size();
assert!(initial_memory > 0);

// updated on second write
writer.write(&batch).unwrap();
assert!(writer.in_progress_size() > initial_size);
assert_eq!(writer.in_progress_rows(), 10);
assert!(writer.memory_size() > initial_memory);

// in progress tracking is cleared, but the overall data written is updated
let pre_flush_bytes_written = writer.bytes_written();
writer.flush().unwrap();
assert_eq!(writer.in_progress_size(), 0);
assert_eq!(writer.in_progress_rows(), 0);
assert_eq!(writer.memory_size(), 0);
assert!(writer.bytes_written() > pre_flush_bytes_written);

writer.close().unwrap();
19 changes: 19 additions & 0 deletions parquet/src/column/writer/encoder.rs
@@ -93,10 +93,19 @@ pub trait ColumnValueEncoder {
/// Returns true if this encoder has a dictionary page
Contributor:
likewise, while this trait is marked pub it is not exposed outside the crate: https://docs.rs/parquet/latest/parquet/?search=ColumnValueEncoder

fn has_dictionary(&self) -> bool;

/// Returns the estimated total memory usage of the encoder
///
/// This should include:
/// <already_written_encoded_byte_size> + <current_memory_size_of_unflushed_bytes> + <bytes_associated_with_processing>
fn estimated_memory_size(&self) -> usize;

/// Returns an estimate of the dictionary page size in bytes, or `None` if no dictionary
fn estimated_dict_page_size(&self) -> Option<usize>;

/// Returns an estimate of the data page size in bytes
///
/// This should include:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
fn estimated_data_page_size(&self) -> usize;

/// Flush the dictionary page for this column chunk if any. Any subsequent calls to
@@ -227,6 +236,16 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
self.dict_encoder.is_some()
}

fn estimated_memory_size(&self) -> usize {
match &self.dict_encoder {
// For this DictEncoder, we have unencoded bytes in the buffer.
Some(encoder) => encoder.estimated_memory_size(),
// Whereas for all other encoders the buffer contains encoded bytes.
// Therefore, we can use the estimated_data_encoded_size.
Contributor (Author):
Maybe the phrase "all other encoders" should be changed to "it's presumed for all other encoders", since this is a moving target.

Contributor:
I reviewed this carefully and I think it is worth not intermixing the encoded estimate with the memory usage, so I took the liberty of implementing estimated_memory_size for Encoder as well

_ => self.encoder.estimated_data_encoded_size(),
}
}

fn estimated_dict_page_size(&self) -> Option<usize> {
Some(self.dict_encoder.as_ref()?.dict_encoded_size())
}
30 changes: 27 additions & 3 deletions parquet/src/column/writer/mod.rs
@@ -72,7 +72,13 @@ pub enum ColumnWriter<'a> {
}

impl<'a> ColumnWriter<'a> {
/// Returns the estimated total bytes for this column writer
/// Returns the estimated total memory usage
#[cfg(feature = "arrow")]
pub(crate) fn memory_size(&self) -> usize {
downcast_writer!(self, typed, typed.memory_size())
}

/// Returns the estimated total encoded bytes for this column writer
#[cfg(feature = "arrow")]
pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
downcast_writer!(self, typed, typed.get_estimated_total_bytes())
@@ -419,6 +425,20 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
)
}

/// Returns the estimated total memory usage.
///
/// Unlike [`Self::get_estimated_total_bytes`] this is an estimate
/// of the current memory usage and not its anticipated encoded size.
///
/// This includes:
/// <already_written_encoded_byte_size> + <current_memory_size_of_unflushed_bytes> + <bytes_associated_with_processing>
///
/// In the vast majority of cases our unflushed bytes are already encoded.
#[cfg(feature = "arrow")]
pub(crate) fn memory_size(&self) -> usize {
self.column_metrics.total_bytes_written as usize + self.encoder.estimated_memory_size()
}

/// Returns total number of bytes written by this column writer so far.
/// This value is also returned when column writer is closed.
///
@@ -428,10 +448,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
self.column_metrics.total_bytes_written
}

/// Returns the estimated total bytes for this column writer
/// Returns the estimated total encoded bytes for this column writer.
///
/// Unlike [`Self::get_total_bytes_written`] this includes an estimate
/// of any data that has not yet been flushed to a page
/// of any data that has not yet been flushed to a page, based on its
/// anticipated encoded size.
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
#[cfg(feature = "arrow")]
pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
self.column_metrics.total_bytes_written
14 changes: 14 additions & 0 deletions parquet/src/encodings/encoding/dict_encoder.rs
@@ -143,6 +143,16 @@ impl<T: DataType> DictEncoder<T> {
fn bit_width(&self) -> u8 {
num_required_bits(self.num_entries().saturating_sub(1) as u64)
}

/// Returns the estimated total memory usage
///
/// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
///
/// Therefore, we have a specific memory estimate (during encoding) of:
/// <already_written_encoded_byte_size> + <current_memory_size_of_unflushed_bytes>
pub(crate) fn estimated_memory_size(&self) -> usize {
self.interner.storage().size_in_bytes + self.indices.len() * 8
}
}

impl<T: DataType> Encoder<T> for DictEncoder<T> {
@@ -161,6 +171,10 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
Encoding::PLAIN_DICTIONARY
}

/// Returns an estimate of the data page size in bytes
///
/// This includes:
/// <already_written_encoded_byte_size> + <estimated_encoded_size_of_unflushed_bytes>
fn estimated_data_encoded_size(&self) -> usize {
let bit_width = self.bit_width();
RleEncoder::max_buffer_size(bit_width, self.indices.len())
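The bit-width computation that feeds `estimated_data_encoded_size` above can be sketched standalone (a toy reimplementation for illustration, assuming the same ceil(log2) behavior as the crate's `num_required_bits` helper):

```rust
// Indices into a dictionary of `num_entries` values each need enough bits
// to represent the largest index, num_entries - 1.
fn num_required_bits(x: u64) -> u8 {
    (64 - x.leading_zeros()) as u8
}

fn bit_width(num_entries: u64) -> u8 {
    num_required_bits(num_entries.saturating_sub(1))
}

fn main() {
    assert_eq!(bit_width(1), 0); // a single entry: every index is 0
    assert_eq!(bit_width(2), 1);
    assert_eq!(bit_width(256), 8); // indices 0..=255 fit in 8 bits
    assert_eq!(bit_width(257), 9); // one more entry forces a 9th bit
}
```

This is why the estimated data page size grows with both the dictionary cardinality (via `bit_width`) and the number of buffered indices.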