1
1
#include " ArchiveWriter.hpp"
2
2
3
3
#include < algorithm>
4
+ #include < filesystem>
5
+ #include < sstream>
4
6
5
7
#include < json/single_include/nlohmann/json.hpp>
6
8
@@ -13,18 +15,23 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) {
13
15
m_id = boost::uuids::to_string (option.id );
14
16
m_compression_level = option.compression_level ;
15
17
m_print_archive_stats = option.print_archive_stats ;
18
+ m_single_file_archive = option.single_file_archive ;
16
19
m_min_table_size = option.min_table_size ;
17
- auto archive_path = boost::filesystem::path (option.archives_dir ) / m_id;
20
+ m_archives_dir = option.archives_dir ;
21
+ std::string working_dir_name = m_id;
22
+ if (option.single_file_archive ) {
23
+ working_dir_name += constants::cTmpPostfix;
24
+ }
25
+ auto archive_path = std::filesystem::path (option.archives_dir ) / working_dir_name;
18
26
19
- boost::system ::error_code boost_error_code;
20
- bool path_exists = boost::filesystem::exists (archive_path, boost_error_code);
21
- if (path_exists) {
27
+ std::error_code ec;
28
+ if (std::filesystem::exists (archive_path, ec)) {
22
29
SPDLOG_ERROR (" Archive path already exists: {}" , archive_path.c_str ());
23
30
throw OperationFailed (ErrorCodeUnsupported, __FILENAME__, __LINE__);
24
31
}
25
32
26
33
m_archive_path = archive_path.string ();
27
- if (false == boost ::filesystem::create_directory (m_archive_path)) {
34
+ if (false == std ::filesystem::create_directory (m_archive_path, ec )) {
28
35
throw OperationFailed (ErrorCodeErrno, __FILENAME__, __LINE__);
29
36
}
30
37
@@ -39,20 +46,42 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) {
39
46
std::string array_dict_path = m_archive_path + constants::cArchiveArrayDictFile;
40
47
m_array_dict = std::make_shared<LogTypeDictionaryWriter>();
41
48
m_array_dict->open (array_dict_path, m_compression_level, UINT64_MAX);
42
-
43
- std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile;
44
- m_timestamp_dict = std::make_shared<TimestampDictionaryWriter>();
45
- m_timestamp_dict->open (timestamp_dict_path, m_compression_level);
46
49
}
47
50
48
51
void ArchiveWriter::close () {
49
- m_compressed_size += m_var_dict->close ();
50
- m_compressed_size += m_log_dict->close ();
51
- m_compressed_size += m_array_dict->close ();
52
- m_compressed_size += m_timestamp_dict->close ();
53
- m_compressed_size += m_schema_tree.store (m_archive_path, m_compression_level);
54
- m_compressed_size += m_schema_map.store (m_archive_path, m_compression_level);
55
- m_compressed_size += store_tables ();
52
+ auto var_dict_compressed_size = m_var_dict->close ();
53
+ auto log_dict_compressed_size = m_log_dict->close ();
54
+ auto array_dict_compressed_size = m_array_dict->close ();
55
+ auto schema_tree_compressed_size = m_schema_tree.store (m_archive_path, m_compression_level);
56
+ auto schema_map_compressed_size = m_schema_map.store (m_archive_path, m_compression_level);
57
+ auto [table_metadata_compressed_size, table_compressed_size] = store_tables ();
58
+
59
+ if (m_single_file_archive) {
60
+ std::vector<ArchiveFileInfo> files{
61
+ {constants::cArchiveSchemaTreeFile, schema_tree_compressed_size},
62
+ {constants::cArchiveSchemaMapFile, schema_map_compressed_size},
63
+ {constants::cArchiveTableMetadataFile, table_metadata_compressed_size},
64
+ {constants::cArchiveVarDictFile, var_dict_compressed_size},
65
+ {constants::cArchiveLogDictFile, log_dict_compressed_size},
66
+ {constants::cArchiveArrayDictFile, array_dict_compressed_size},
67
+ {constants::cArchiveTablesFile, table_compressed_size}
68
+ };
69
+ uint64_t offset = 0 ;
70
+ for (auto & file : files) {
71
+ uint64_t original_size = file.o ;
72
+ file.o = offset;
73
+ offset += original_size;
74
+ }
75
+ write_single_file_archive (files);
76
+ } else {
77
+ // Timestamp dictionary written separately here until we transition to moving it inside of
78
+ // the metadata region of multi-file archives.
79
+ auto timestamp_dict_compressed_size = write_timestamp_dict ();
80
+ m_compressed_size = var_dict_compressed_size + log_dict_compressed_size
81
+ + array_dict_compressed_size + timestamp_dict_compressed_size
82
+ + schema_tree_compressed_size + schema_map_compressed_size
83
+ + table_metadata_compressed_size + table_compressed_size;
84
+ }
56
85
57
86
if (m_metadata_db) {
58
87
update_metadata_db ();
@@ -65,12 +94,130 @@ void ArchiveWriter::close() {
65
94
m_id_to_schema_writer.clear ();
66
95
m_schema_tree.clear ();
67
96
m_schema_map.clear ();
97
+ m_timestamp_dict.clear ();
68
98
m_encoded_message_size = 0UL ;
69
99
m_uncompressed_size = 0UL ;
70
100
m_compressed_size = 0UL ;
71
101
m_next_log_event_id = 0 ;
72
102
}
73
103
104
+ size_t ArchiveWriter::write_timestamp_dict () {
105
+ std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile;
106
+ FileWriter timestamp_dict_file_writer;
107
+ ZstdCompressor timestamp_dict_compressor;
108
+ timestamp_dict_file_writer.open (timestamp_dict_path, FileWriter::OpenMode::CreateForWriting);
109
+ timestamp_dict_compressor.open (timestamp_dict_file_writer, m_compression_level);
110
+ std::stringstream timestamp_dict_stream;
111
+ m_timestamp_dict.write (timestamp_dict_stream);
112
+ std::string encoded_timestamp_dict = timestamp_dict_stream.str ();
113
+ timestamp_dict_compressor.write (encoded_timestamp_dict.data (), encoded_timestamp_dict.size ());
114
+ timestamp_dict_compressor.close ();
115
+ auto compressed_size = timestamp_dict_file_writer.get_pos ();
116
+ timestamp_dict_file_writer.close ();
117
+ return compressed_size;
118
+ }
119
+
120
+ void ArchiveWriter::write_single_file_archive (std::vector<ArchiveFileInfo> const & files) {
121
+ std::string single_file_archive_path = (std::filesystem::path (m_archives_dir) / m_id).string ();
122
+ FileWriter archive_writer;
123
+ archive_writer.open (single_file_archive_path, FileWriter::OpenMode::CreateForWriting);
124
+
125
+ write_archive_metadata (archive_writer, files);
126
+ size_t metadata_section_size = archive_writer.get_pos () - sizeof (ArchiveHeader);
127
+ write_archive_files (archive_writer, files);
128
+ m_compressed_size = archive_writer.get_pos ();
129
+ write_archive_header (archive_writer, metadata_section_size);
130
+
131
+ archive_writer.close ();
132
+ std::error_code ec;
133
+ if (false == std::filesystem::remove (m_archive_path, ec)) {
134
+ throw OperationFailed (ErrorCodeFileExists, __FILENAME__, __LINE__);
135
+ }
136
+ }
137
+
138
+ void ArchiveWriter::write_archive_metadata (
139
+ FileWriter& archive_writer,
140
+ std::vector<ArchiveFileInfo> const & files
141
+ ) {
142
+ archive_writer.seek_from_begin (sizeof (ArchiveHeader));
143
+
144
+ ZstdCompressor compressor;
145
+ compressor.open (archive_writer, m_compression_level);
146
+ compressor.write_numeric_value (static_cast <uint8_t >(3U )); // Number of packets
147
+
148
+ // Write archive info
149
+ ArchiveInfoPacket archive_info{.num_segments = 1 };
150
+ std::stringstream msgpack_buffer;
151
+ msgpack::pack (msgpack_buffer, archive_info);
152
+ std::string archive_info_str = msgpack_buffer.str ();
153
+ compressor.write_numeric_value (ArchiveMetadataPacketType::ArchiveInfo);
154
+ compressor.write_numeric_value (static_cast <uint32_t >(archive_info_str.size ()));
155
+ compressor.write_string (archive_info_str);
156
+
157
+ // Write archive file info
158
+ ArchiveFileInfoPacket archive_file_info{.files {files}};
159
+ msgpack_buffer = std::stringstream{};
160
+ msgpack::pack (msgpack_buffer, archive_file_info);
161
+ std::string archive_file_info_str = msgpack_buffer.str ();
162
+ compressor.write_numeric_value (ArchiveMetadataPacketType::ArchiveFileInfo);
163
+ compressor.write_numeric_value (static_cast <uint32_t >(archive_file_info_str.size ()));
164
+ compressor.write_string (archive_file_info_str);
165
+
166
+ // Write timestamp dictionary
167
+ compressor.write_numeric_value (ArchiveMetadataPacketType::TimestampDictionary);
168
+ std::stringstream timestamp_dict_stream;
169
+ m_timestamp_dict.write (timestamp_dict_stream);
170
+ std::string encoded_timestamp_dict = timestamp_dict_stream.str ();
171
+ compressor.write_numeric_value (static_cast <uint32_t >(encoded_timestamp_dict.size ()));
172
+ compressor.write (encoded_timestamp_dict.data (), encoded_timestamp_dict.size ());
173
+
174
+ compressor.close ();
175
+ }
176
+
177
+ void ArchiveWriter::write_archive_files (
178
+ FileWriter& archive_writer,
179
+ std::vector<ArchiveFileInfo> const & files
180
+ ) {
181
+ FileReader reader;
182
+ for (auto const & file : files) {
183
+ std::string file_path = m_archive_path + file.n ;
184
+ reader.open (file_path);
185
+ char read_buffer[cReadBlockSize];
186
+ while (true ) {
187
+ size_t num_bytes_read{0 };
188
+ ErrorCode const error_code
189
+ = reader.try_read (read_buffer, cReadBlockSize, num_bytes_read);
190
+ if (ErrorCodeEndOfFile == error_code) {
191
+ break ;
192
+ } else if (ErrorCodeSuccess != error_code) {
193
+ throw OperationFailed (error_code, __FILENAME__, __LINE__);
194
+ }
195
+ archive_writer.write (read_buffer, num_bytes_read);
196
+ }
197
+ reader.close ();
198
+ if (false == std::filesystem::remove (file_path)) {
199
+ throw OperationFailed (ErrorCodeFileExists, __FILENAME__, __LINE__);
200
+ }
201
+ }
202
+ }
203
+
204
+ void ArchiveWriter::write_archive_header (FileWriter& archive_writer, size_t metadata_section_size) {
205
+ ArchiveHeader header{
206
+ .magic_number {0 },
207
+ .version
208
+ = (cArchiveMajorVersion << 24 ) | (cArchiveMinorVersion << 16 ) | cArchivePatchVersion,
209
+ .uncompressed_size = m_uncompressed_size,
210
+ .compressed_size = m_compressed_size,
211
+ .reserved_padding {0 },
212
+ .metadata_section_size = static_cast <uint32_t >(metadata_section_size),
213
+ .compression_type = static_cast <uint16_t >(ArchiveCompressionType::Zstd),
214
+ .padding = 0
215
+ };
216
+ std::memcpy (&header.magic_number , cStructuredSFAMagicNumber, sizeof (header.magic_number ));
217
+ archive_writer.seek_from_begin (0 );
218
+ archive_writer.write (reinterpret_cast <char const *>(&header), sizeof (header));
219
+ }
220
+
74
221
void ArchiveWriter::append_message (
75
222
int32_t schema_id,
76
223
Schema const & schema,
@@ -132,8 +279,7 @@ void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, Schema const&
132
279
}
133
280
}
134
281
135
- size_t ArchiveWriter::store_tables () {
136
- size_t compressed_size = 0 ;
282
+ std::pair<size_t , size_t > ArchiveWriter::store_tables () {
137
283
m_tables_file_writer.open (
138
284
m_archive_path + constants::cArchiveTablesFile,
139
285
FileWriter::OpenMode::CreateForWriting
@@ -243,13 +389,13 @@ size_t ArchiveWriter::store_tables() {
243
389
}
244
390
m_table_metadata_compressor.close ();
245
391
246
- compressed_size + = m_table_metadata_file_writer.get_pos ();
247
- compressed_size + = m_tables_file_writer.get_pos ();
392
+ auto table_metadata_compressed_size = m_table_metadata_file_writer.get_pos ();
393
+ auto table_compressed_size = m_tables_file_writer.get_pos ();
248
394
249
395
m_table_metadata_file_writer.close ();
250
396
m_tables_file_writer.close ();
251
397
252
- return compressed_size ;
398
+ return {table_metadata_compressed_size, table_compressed_size} ;
253
399
}
254
400
255
401
void ArchiveWriter::update_metadata_db () {
@@ -262,8 +408,8 @@ void ArchiveWriter::update_metadata_db() {
262
408
metadata.increment_static_compressed_size (m_compressed_size);
263
409
metadata.increment_static_uncompressed_size (m_uncompressed_size);
264
410
metadata.expand_time_range (
265
- m_timestamp_dict-> get_begin_timestamp (),
266
- m_timestamp_dict-> get_end_timestamp ()
411
+ m_timestamp_dict. get_begin_timestamp (),
412
+ m_timestamp_dict. get_end_timestamp ()
267
413
);
268
414
269
415
m_metadata_db->add_archive (m_id, metadata);
0 commit comments