3737#include " arrow/util/bitmap_ops.h"
3838#include " arrow/util/checked_cast.h"
3939#include " arrow/util/compression.h"
40+ #include " arrow/util/crc32.h"
4041#include " arrow/util/endian.h"
4142#include " arrow/util/logging.h"
4243#include " arrow/util/rle_encoding.h"
@@ -248,6 +249,7 @@ class SerializedPageWriter : public PageWriter {
248249 SerializedPageWriter (std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
249250 int compression_level, ColumnChunkMetaDataBuilder* metadata,
250251 int16_t row_group_ordinal, int16_t column_chunk_ordinal,
252+ bool use_page_checksum_verification,
251253 MemoryPool* pool = ::arrow::default_memory_pool(),
252254 std::shared_ptr<Encryptor> meta_encryptor = nullptr ,
253255 std::shared_ptr<Encryptor> data_encryptor = nullptr )
@@ -262,6 +264,7 @@ class SerializedPageWriter : public PageWriter {
262264 page_ordinal_(0 ),
263265 row_group_ordinal_(row_group_ordinal),
264266 column_ordinal_(column_chunk_ordinal),
267+ page_checksum_verification_(use_page_checksum_verification),
265268 meta_encryptor_(std::move(meta_encryptor)),
266269 data_encryptor_(std::move(data_encryptor)),
267270 encryption_buffer_(AllocateBuffer(pool, 0 )) {
@@ -379,7 +382,13 @@ class SerializedPageWriter : public PageWriter {
379382 format::PageHeader page_header;
380383 page_header.__set_uncompressed_page_size (static_cast <int32_t >(uncompressed_size));
381384 page_header.__set_compressed_page_size (static_cast <int32_t >(output_data_len));
382- // TODO(PARQUET-594) crc checksum
385+
386+ // TODO(PARQUET-594) crc checksum for DATA_PAGE_V2 and DICT_PAGE
387+ if (page_checksum_verification_ && page.type () == PageType::DATA_PAGE) {
388+ uint32_t crc32 =
389+ ::arrow::internal::crc32 (/* prev */ 0 , output_data_buffer, output_data_len);
390+ page_header.__set_crc (static_cast <int32_t >(crc32));
391+ }
383392
384393 if (page.type () == PageType::DATA_PAGE) {
385394 const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
@@ -425,7 +434,7 @@ class SerializedPageWriter : public PageWriter {
425434 page_header.__set_data_page_header (data_page_header);
426435 }
427436
428- void SetDataPageV2Header (format::PageHeader& page_header, const DataPageV2 page) {
437+ void SetDataPageV2Header (format::PageHeader& page_header, const DataPageV2& page) {
429438 format::DataPageHeaderV2 data_page_header;
430439 data_page_header.__set_num_values (page.num_values ());
431440 data_page_header.__set_num_nulls (page.num_nulls ());
@@ -456,6 +465,8 @@ class SerializedPageWriter : public PageWriter {
456465
// Returns the current value of total_uncompressed_size_.
457466 int64_t total_uncompressed_size () { return total_uncompressed_size_; }
458467
// Returns the flag this writer was constructed with
// (`use_page_checksum_verification`). When it is true, the page-writing code
// computes a CRC32 over the output data buffer and sets it on the page header,
// but only for pages whose type is PageType::DATA_PAGE.
// NOTE(review): despite the "verification" wording, on this write path the
// flag controls whether a checksum is written -- confirm the intended naming.
468+ bool page_checksum_verification () { return page_checksum_verification_; }
469+
459470 private:
460471 // To allow UpdateEncryption on Close
461472 friend class BufferedPageWriter ;
@@ -520,6 +531,7 @@ class SerializedPageWriter : public PageWriter {
520531 int32_t page_ordinal_;
521532 int16_t row_group_ordinal_;
522533 int16_t column_ordinal_;
534+ bool page_checksum_verification_;
523535
524536 std::unique_ptr<ThriftSerializer> thrift_serializer_;
525537
@@ -544,15 +556,16 @@ class BufferedPageWriter : public PageWriter {
// Constructs a page writer that stages its output in memory.
//
// `sink` is kept as final_sink_ and `metadata` as metadata_;
// has_dictionary_pages_ starts out false. The remaining arguments (codec,
// compression level, ordinals, checksum flag, pool, encryptors) are not stored
// on this object: they are forwarded to an inner SerializedPageWriter (pager_)
// that writes to in_memory_sink_ rather than to `sink`.
//
// NOTE(review): this text is a rendered diff. The lines marked `-` carry the
// previous argument list (without the checksum flag) and the lines marked `+`
// carry the new one; only one of the two belongs in the real file -- confirm
// against the actual source before applying.
544556 BufferedPageWriter (std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
545557 int compression_level, ColumnChunkMetaDataBuilder* metadata,
546558 int16_t row_group_ordinal, int16_t current_column_ordinal,
559+ bool use_page_checksum_verification,
547560 MemoryPool* pool = ::arrow::default_memory_pool(),
548561 std::shared_ptr<Encryptor> meta_encryptor = nullptr ,
549562 std::shared_ptr<Encryptor> data_encryptor = nullptr )
550563 : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false ) {
// The in-memory stream is allocated from `pool` and handed to the inner writer.
551564 in_memory_sink_ = CreateOutputStream (pool);
552565 pager_ = std::make_unique<SerializedPageWriter>(
553566 in_memory_sink_, codec, compression_level, metadata, row_group_ordinal,
554- current_column_ordinal, pool, std::move (meta_encryptor) ,
555- std::move (data_encryptor));
// The checksum flag is passed positionally right after the column ordinal,
// matching the parameter order of the SerializedPageWriter constructor.
567+ current_column_ordinal, use_page_checksum_verification, pool ,
568+ std::move (meta_encryptor), std::move ( data_encryptor));
556569 }
557570
558571 int64_t WriteDictionaryPage (const DictionaryPage& page) override {
@@ -606,15 +619,17 @@ std::unique_ptr<PageWriter> PageWriter::Open(
606619 int compression_level, ColumnChunkMetaDataBuilder* metadata,
607620 int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
608621 bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
609- std::shared_ptr<Encryptor> data_encryptor) {
622+ std::shared_ptr<Encryptor> data_encryptor, bool page_write_checksum_enabled ) {
610623 if (buffered_row_group) {
611- return std::make_unique<BufferedPageWriter> (
624+ return std::unique_ptr<PageWriter>( new BufferedPageWriter (
612625 std::move (sink), codec, compression_level, metadata, row_group_ordinal,
613- column_chunk_ordinal, pool, std::move (meta_encryptor), std::move (data_encryptor));
626+ column_chunk_ordinal, page_write_checksum_enabled, pool,
627+ std::move (meta_encryptor), std::move (data_encryptor)));
614628 } else {
615- return std::make_unique<SerializedPageWriter> (
629+ return std::unique_ptr<PageWriter>( new SerializedPageWriter (
616630 std::move (sink), codec, compression_level, metadata, row_group_ordinal,
617- column_chunk_ordinal, pool, std::move (meta_encryptor), std::move (data_encryptor));
631+ column_chunk_ordinal, page_write_checksum_enabled, pool,
632+ std::move (meta_encryptor), std::move (data_encryptor)));
618633 }
619634}
620635
0 commit comments