diff --git a/components/core/cmake/Options/options.cmake b/components/core/cmake/Options/options.cmake index 1030fa0a61..a08403f692 100644 --- a/components/core/cmake/Options/options.cmake +++ b/components/core/cmake/Options/options.cmake @@ -132,6 +132,7 @@ function(validate_clp_binaries_dependencies) CLP_BUILD_CLP_S_SEARCH CLP_BUILD_CLP_S_SEARCH_AST CLP_BUILD_CLP_S_SEARCH_KQL + CLP_BUILD_CLP_S_TIMESTAMP_PARSER ) endfunction() @@ -207,6 +208,7 @@ function(validate_clp_s_archivereader_dependencies) CLP_BUILD_CLP_STRING_UTILS CLP_BUILD_CLP_S_CLP_DEPENDENCIES CLP_BUILD_CLP_S_IO + CLP_BUILD_CLP_S_TIMESTAMP_PARSER CLP_BUILD_CLP_S_TIMESTAMPPATTERN ) endfunction() @@ -228,6 +230,7 @@ function(validate_clp_s_archivewriter_dependencies) validate_clp_dependencies_for_target(CLP_BUILD_CLP_S_ARCHIVEWRITER CLP_BUILD_CLP_S_CLP_DEPENDENCIES CLP_BUILD_CLP_S_IO + CLP_BUILD_CLP_S_TIMESTAMP_PARSER CLP_BUILD_CLP_S_TIMESTAMPPATTERN ) endfunction() diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 70e5c44f91..3c669a2eac 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -212,8 +212,12 @@ BaseColumnReader* ArchiveReader::append_reader_column(SchemaReader& reader, int3 case NodeType::UnstructuredArray: column_reader = new ClpStringColumnReader(column_id, m_var_dict, m_array_dict, true); break; - case NodeType::DateString: - column_reader = new DateStringColumnReader(column_id, get_timestamp_dictionary()); + case NodeType::DeprecatedDateString: + column_reader + = new DeprecatedDateStringColumnReader(column_id, get_timestamp_dictionary()); + break; + case NodeType::Timestamp: + column_reader = new TimestampColumnReader(column_id, get_timestamp_dictionary()); break; // No need to push columns without associated object readers into the SchemaReader. 
case NodeType::Metadata: @@ -268,10 +272,11 @@ void ArchiveReader::append_unordered_reader_columns( case NodeType::Boolean: column_reader = new BooleanColumnReader(column_id); break; - // UnstructuredArray and DateString currently aren't supported as part of any unordered - // object, so we disregard them here + // UnstructuredArray, DeprecatedDateString, and Timestamp currently aren't supported as + // part of any unordered object, so we disregard them here case NodeType::UnstructuredArray: - case NodeType::DateString: + case NodeType::DeprecatedDateString: + case NodeType::Timestamp: // No need to push columns without associated object readers into the SchemaReader. case NodeType::StructuredArray: case NodeType::Object: diff --git a/components/core/src/clp_s/ArchiveReader.hpp b/components/core/src/clp_s/ArchiveReader.hpp index 7b13afcfb2..7cf1e51d70 100644 --- a/components/core/src/clp_s/ArchiveReader.hpp +++ b/components/core/src/clp_s/ArchiveReader.hpp @@ -151,6 +151,14 @@ class ArchiveReader { */ bool has_log_order() { return m_log_event_idx_column_id >= 0; } + /** + * @return Whether this archive can contain columns with the deprecated DateString timestamp + * format. + */ + [[nodiscard]] auto has_deprecated_timestamp_format() const -> bool { + return get_header().has_deprecated_timestamp_format(); + } + private: /** * Initializes a schema reader passed by reference to become a reader for a given schema. 
diff --git a/components/core/src/clp_s/ArchiveReaderAdaptor.cpp b/components/core/src/clp_s/ArchiveReaderAdaptor.cpp index a38bf40554..d0507a3d1c 100644 --- a/components/core/src/clp_s/ArchiveReaderAdaptor.cpp +++ b/components/core/src/clp_s/ArchiveReaderAdaptor.cpp @@ -75,7 +75,10 @@ ArchiveReaderAdaptor::try_read_archive_file_info(ZstdDecompressor& decompressor, ErrorCode ArchiveReaderAdaptor::try_read_timestamp_dictionary(ZstdDecompressor& decompressor, size_t size) { - return m_timestamp_dictionary->read(decompressor); + return m_timestamp_dictionary->read( + decompressor, + m_archive_header.has_deprecated_timestamp_format() + ); } ErrorCode ArchiveReaderAdaptor::try_read_archive_info(ZstdDecompressor& decompressor, size_t size) { diff --git a/components/core/src/clp_s/ArchiveWriter.cpp b/components/core/src/clp_s/ArchiveWriter.cpp index e8ed88dc6d..073b44539d 100644 --- a/components/core/src/clp_s/ArchiveWriter.cpp +++ b/components/core/src/clp_s/ArchiveWriter.cpp @@ -333,12 +333,13 @@ void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, Schema const& std::make_unique(id, m_var_dict, m_array_dict) ); break; - case NodeType::DateString: - writer->append_column(std::make_unique(id)); - break; case NodeType::DeltaInteger: writer->append_column(std::make_unique(id)); break; + case NodeType::Timestamp: + writer->append_column(std::make_unique(id)); + break; + case NodeType::DeprecatedDateString: case NodeType::Metadata: case NodeType::NullValue: case NodeType::Object: diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index ea31257840..1268670d34 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -193,34 +193,49 @@ class ArchiveWriter { int32_t add_schema(Schema const& schema) { return m_schema_map.add_schema(schema); } /** - * Ingests a timestamp entry from a string + * Ingests a timestamp entry from a string. 
* @param key * @param node_id * @param timestamp - * @param pattern_id - * @return the epoch time corresponding to the string timestamp + * @param is_json_literal + * @return Forwards `TimestampDictionaryWriter::ingest_string_timestamp`'s return values. */ - epochtime_t ingest_timestamp_entry( + [[nodiscard]] auto ingest_string_timestamp( std::string_view key, int32_t node_id, std::string_view timestamp, - uint64_t& pattern_id - ) { - return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); + bool is_json_literal + ) -> std::pair { + return m_timestamp_dict.ingest_string_timestamp(key, node_id, timestamp, is_json_literal); } /** - * Ingests a timestamp entry from a number - * @param column_key + * Ingests a numeric JSON timestamp entry. + * @param key * @param node_id * @param timestamp + * @return Forwards `TimestampDictionaryWriter::ingest_numeric_json_timestamp`'s return values. */ - void ingest_timestamp_entry(std::string_view key, int32_t node_id, double timestamp) { - m_timestamp_dict.ingest_entry(key, node_id, timestamp); + [[nodiscard]] auto + ingest_numeric_json_timestamp(std::string_view key, int32_t node_id, std::string_view timestamp) + -> std::pair { + return m_timestamp_dict.ingest_numeric_json_timestamp(key, node_id, timestamp); } - void ingest_timestamp_entry(std::string_view key, int32_t node_id, int64_t timestamp) { - m_timestamp_dict.ingest_entry(key, node_id, timestamp); + /** + * Ingests an unknown precision epoch timestamp. + * @param key + * @param node_id + * @param timestamp + * @return Forwards `TimestampDictionaryWriter::ingest_unknown_precision_epoch_timestamp`'s + * return values. 
+ */ + [[nodiscard]] auto ingest_unknown_precision_epoch_timestamp( + std::string_view key, + int32_t node_id, + int64_t timestamp + ) -> std::pair { + return m_timestamp_dict.ingest_unknown_precision_epoch_timestamp(key, node_id, timestamp); } /** diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 77cd106759..9c23f2b553 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -281,13 +281,14 @@ if(CLP_BUILD_CLP_S_ARCHIVEWRITER) absl::flat_hash_map clp_s::clp_dependencies clp_s::io + clp_s::timestamp_parser + clp_s::timestamp_pattern msgpack-cxx nlohmann_json::nlohmann_json simdjson::simdjson ystdlib::error_handling PRIVATE Boost::url - clp_s::timestamp_pattern ${CURL_LIBRARIES} fmt::fmt spdlog::spdlog @@ -345,13 +346,14 @@ if(CLP_BUILD_CLP_S_ARCHIVEREADER) absl::flat_hash_map clp::string_utils clp_s::io + clp_s::timestamp_parser + clp_s::timestamp_pattern msgpack-cxx nlohmann_json::nlohmann_json ystdlib::error_handling PRIVATE Boost::url clp_s::clp_dependencies - clp_s::timestamp_pattern ${CURL_LIBRARIES} fmt::fmt spdlog::spdlog diff --git a/components/core/src/clp_s/ColumnReader.cpp b/components/core/src/clp_s/ColumnReader.cpp index 41abed057e..f946786024 100644 --- a/components/core/src/clp_s/ColumnReader.cpp +++ b/components/core/src/clp_s/ColumnReader.cpp @@ -226,31 +226,31 @@ int64_t VariableStringColumnReader::get_variable_id(uint64_t cur_message) { return m_variables[cur_message]; } -void DateStringColumnReader::load(BufferViewReader& reader, uint64_t num_messages) { +void DeprecatedDateStringColumnReader::load(BufferViewReader& reader, uint64_t num_messages) { m_timestamps = reader.read_unaligned_span(num_messages); m_timestamp_encodings = reader.read_unaligned_span(num_messages); } -std::variant DateStringColumnReader::extract_value( +std::variant DeprecatedDateStringColumnReader::extract_value( uint64_t cur_message ) { - return 
m_timestamp_dict->get_string_encoding( + return m_timestamp_dict->get_deprecated_timestamp_string_encoding( m_timestamps[cur_message], m_timestamp_encodings[cur_message] ); } -void DateStringColumnReader::extract_string_value_into_buffer( +void DeprecatedDateStringColumnReader::extract_string_value_into_buffer( uint64_t cur_message, std::string& buffer ) { - buffer.append(m_timestamp_dict->get_string_encoding( + buffer.append(m_timestamp_dict->get_deprecated_timestamp_string_encoding( m_timestamps[cur_message], m_timestamp_encodings[cur_message] )); } -epochtime_t DateStringColumnReader::get_encoded_time(uint64_t cur_message) { +epochtime_t DeprecatedDateStringColumnReader::get_encoded_time(uint64_t cur_message) { return m_timestamps[cur_message]; } diff --git a/components/core/src/clp_s/ColumnReader.hpp b/components/core/src/clp_s/ColumnReader.hpp index d1138d5c96..3563642581 100644 --- a/components/core/src/clp_s/ColumnReader.hpp +++ b/components/core/src/clp_s/ColumnReader.hpp @@ -327,20 +327,23 @@ class VariableStringColumnReader : public BaseColumnReader { UnalignedMemSpan m_variables; }; -class DateStringColumnReader : public BaseColumnReader { +class DeprecatedDateStringColumnReader : public BaseColumnReader { public: // Constructor - DateStringColumnReader(int32_t id, std::shared_ptr timestamp_dict) + DeprecatedDateStringColumnReader( + int32_t id, + std::shared_ptr timestamp_dict + ) : BaseColumnReader(id), m_timestamp_dict(std::move(timestamp_dict)) {} // Destructor - ~DateStringColumnReader() override = default; + ~DeprecatedDateStringColumnReader() override = default; // Methods inherited from BaseColumnReader void load(BufferViewReader& reader, uint64_t num_messages) override; - NodeType get_type() override { return NodeType::DateString; } + NodeType get_type() override { return NodeType::DeprecatedDateString; } std::variant extract_value( uint64_t cur_message diff --git a/components/core/src/clp_s/ColumnWriter.cpp 
b/components/core/src/clp_s/ColumnWriter.cpp index b0ec27fe05..a2898cd07d 100644 --- a/components/core/src/clp_s/ColumnWriter.cpp +++ b/components/core/src/clp_s/ColumnWriter.cpp @@ -165,21 +165,6 @@ void VariableStringColumnWriter::store(ZstdCompressor& compressor) { compressor.write(reinterpret_cast(m_var_dict_ids.data()), size); } -size_t DateStringColumnWriter::add_value(ParsedMessage::variable_t& value) { - auto encoded_timestamp = std::get>(value); - m_timestamps.push_back(encoded_timestamp.second); - m_timestamp_encodings.push_back(encoded_timestamp.first); - return 2 * sizeof(int64_t); - ; -} - -void DateStringColumnWriter::store(ZstdCompressor& compressor) { - size_t timestamps_size = m_timestamps.size() * sizeof(int64_t); - compressor.write(reinterpret_cast(m_timestamps.data()), timestamps_size); - size_t encodings_size = m_timestamp_encodings.size() * sizeof(int64_t); - compressor.write(reinterpret_cast(m_timestamp_encodings.data()), encodings_size); -} - auto TimestampColumnWriter::add_value(ParsedMessage::variable_t& value) -> size_t { auto const [timestamp, encoding] = std::get>(value); auto const encoded_timestamp_size{m_timestamps.add_value(timestamp)}; diff --git a/components/core/src/clp_s/ColumnWriter.hpp b/components/core/src/clp_s/ColumnWriter.hpp index 8b32f23669..673ab124d8 100644 --- a/components/core/src/clp_s/ColumnWriter.hpp +++ b/components/core/src/clp_s/ColumnWriter.hpp @@ -242,24 +242,6 @@ class VariableStringColumnWriter : public BaseColumnWriter { std::vector m_var_dict_ids; }; -class DateStringColumnWriter : public BaseColumnWriter { -public: - // Constructor - explicit DateStringColumnWriter(int32_t id) : BaseColumnWriter(id) {} - - // Destructor - ~DateStringColumnWriter() override = default; - - // Methods inherited from BaseColumnWriter - size_t add_value(ParsedMessage::variable_t& value) override; - - void store(ZstdCompressor& compressor) override; - -private: - std::vector m_timestamps; - std::vector m_timestamp_encodings; 
-}; - class TimestampColumnWriter : public BaseColumnWriter { public: // Constructor diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index c0e2123e7f..7115a4ea53 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -507,16 +507,43 @@ void JsonParser::parse_line( i64_value = number_value.get_int64(); } - node_id = m_archive_writer - ->add_node(node_id_stack.top(), NodeType::Integer, cur_key); - m_current_parsed_message.add_value(node_id, i64_value); + node_id = m_archive_writer->add_node( + node_id_stack.top(), + matches_timestamp ? NodeType::Timestamp : NodeType::Integer, + cur_key + ); if (matches_timestamp) { - m_archive_writer - ->ingest_timestamp_entry(m_timestamp_key, node_id, i64_value); + m_current_parsed_message.add_value( + node_id, + m_archive_writer->ingest_unknown_precision_epoch_timestamp( + m_timestamp_key, + node_id, + i64_value + ) + ); + } else { + m_current_parsed_message.add_value(node_id, i64_value); } } else { auto const double_value{number_value.get_double()}; - if (m_retain_float_format) { + if (matches_timestamp) { + node_id = m_archive_writer->add_node( + node_id_stack.top(), + NodeType::Timestamp, + cur_key + ); + auto const double_value_str{ + trim_trailing_whitespace(line.raw_json_token()) + }; + m_current_parsed_message.add_value( + node_id, + m_archive_writer->ingest_numeric_json_timestamp( + m_timestamp_key, + node_id, + double_value_str + ) + ); + } else if (m_retain_float_format) { auto double_value_str{trim_trailing_whitespace(line.raw_json_token())}; auto const float_format_result{get_float_encoding(double_value_str)}; if (false == float_format_result.has_error() @@ -546,33 +573,32 @@ void JsonParser::parse_line( ->add_node(node_id_stack.top(), NodeType::Float, cur_key); m_current_parsed_message.add_value(node_id, double_value); } - if (matches_timestamp) { - m_archive_writer - ->ingest_timestamp_entry(m_timestamp_key, node_id, 
double_value); - } } m_current_schema.insert_ordered(node_id); break; } case simdjson::ondemand::json_type::string: { - std::string_view value = line.get_string(true); - auto const matches_timestamp - = m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key); - if (matches_timestamp) { - node_id = m_archive_writer->add_node( - node_id_stack.top(), - NodeType::DateString, - cur_key - ); - uint64_t encoding_id{0}; - epochtime_t timestamp = m_archive_writer->ingest_timestamp_entry( - m_timestamp_key, + if (m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key)) { + auto const raw_timestamp_literal{ + trim_trailing_whitespace(line.raw_json_token()) + }; + node_id = m_archive_writer + ->add_node(node_id_stack.top(), NodeType::Timestamp, cur_key); + m_current_parsed_message.add_value( node_id, - value, - encoding_id + m_archive_writer->ingest_string_timestamp( + m_timestamp_key, + node_id, + raw_timestamp_literal, + true + ) ); - m_current_parsed_message.add_value(node_id, encoding_id, timestamp); - } else if (value.find(' ') != std::string::npos) { + m_current_schema.insert_ordered(node_id); + break; + } + + std::string_view value = line.get_string(true); + if (value.find(' ') != std::string::npos) { node_id = m_archive_writer ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key); m_current_parsed_message.add_value(node_id, value); @@ -581,7 +607,6 @@ void JsonParser::parse_line( ->add_node(node_id_stack.top(), NodeType::VarString, cur_key); m_current_parsed_message.add_value(node_id, value); } - m_current_schema.insert_ordered(node_id); break; } @@ -1054,14 +1079,7 @@ auto JsonParser::adjust_archive_node_type_for_timestamp(NodeType node_type, bool if (false == matches_timestamp) { return node_type; } - - switch (node_type) { - case NodeType::ClpString: - case NodeType::VarString: - return NodeType::DateString; - default: - return node_type; - } + return NodeType::Timestamp; } template @@ -1182,17 +1200,34 @@ void 
JsonParser::parse_kv_log_event_subtree( case NodeType::Integer: { auto const i64_value = pair.second.value().get_immutable_view(); - m_current_parsed_message.add_value(node_id, i64_value); if (matches_timestamp) { - m_archive_writer->ingest_timestamp_entry(m_timestamp_key, node_id, i64_value); + m_current_parsed_message.add_value( + node_id, + m_archive_writer->ingest_unknown_precision_epoch_timestamp( + m_timestamp_key, + node_id, + i64_value + ) + ); + } else { + m_current_parsed_message.add_value(node_id, i64_value); } } break; case NodeType::Float: { auto const d_value = pair.second.value().get_immutable_view(); - m_current_parsed_message.add_value(node_id, d_value); if (matches_timestamp) { - m_archive_writer->ingest_timestamp_entry(m_timestamp_key, node_id, d_value); + auto const timestamp_str{fmt::format("{:.9f}", d_value)}; + m_current_parsed_message.add_value( + node_id, + m_archive_writer->ingest_numeric_json_timestamp( + m_timestamp_key, + node_id, + timestamp_str + ) + ); + } else { + m_current_parsed_message.add_value(node_id, d_value); } } break; case NodeType::Boolean: { @@ -1203,14 +1238,15 @@ void JsonParser::parse_kv_log_event_subtree( case NodeType::VarString: { auto const var_value{pair.second.value().get_immutable_view()}; if (matches_timestamp) { - uint64_t encoding_id{}; - auto const timestamp = m_archive_writer->ingest_timestamp_entry( - m_timestamp_key, + m_current_parsed_message.add_value( node_id, - var_value, - encoding_id + m_archive_writer->ingest_string_timestamp( + m_timestamp_key, + node_id, + var_value, + false + ) ); - m_current_parsed_message.add_value(node_id, encoding_id, timestamp); } else { m_current_parsed_message.add_value(node_id, var_value); } @@ -1254,14 +1290,15 @@ void JsonParser::parse_kv_log_event_subtree( ); } - uint64_t encoding_id{}; - auto const timestamp = m_archive_writer->ingest_timestamp_entry( - m_timestamp_key, + m_current_parsed_message.add_value( node_id, - decoding_result.value(), - encoding_id + 
m_archive_writer->ingest_string_timestamp( + m_timestamp_key, + node_id, + decoding_result.value(), + false + ) ); - m_current_parsed_message.add_value(node_id, encoding_id, timestamp); } break; case NodeType::UnstructuredArray: { if (pair.second.value().is()) { diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp index 1f74d11516..6497d54aa7 100644 --- a/components/core/src/clp_s/JsonSerializer.hpp +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -31,6 +31,7 @@ class JsonSerializer { EndArray, BeginUnnamedObject, BeginUnnamedArray, + AddLiteralField, }; static int64_t const cReservedLength = 4096; diff --git a/components/core/src/clp_s/ParsedMessage.hpp b/components/core/src/clp_s/ParsedMessage.hpp index 00feb02719..63de2f6d5e 100644 --- a/components/core/src/clp_s/ParsedMessage.hpp +++ b/components/core/src/clp_s/ParsedMessage.hpp @@ -23,7 +23,6 @@ class ParsedMessage { clp::ffi::EightByteEncodedTextAst, clp::ffi::FourByteEncodedTextAst, bool, - std::pair, std::pair, std::pair>; @@ -50,16 +49,6 @@ class ParsedMessage { m_message.emplace(node_id, std::string{value}); } - /** - * Adds a timestamp value and its encoding to the message for a given MST node ID. - * @param node_id - * @param encoding_id - * @param value - */ - inline void add_value(int32_t node_id, uint64_t encoding_id, epochtime_t value) { - m_message.emplace(node_id, std::make_pair(encoding_id, value)); - } - /** * Adds a float and its format to the message for a given MST node ID. 
* @param node_id diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index a2a4ab6dcb..94fbfb2f31 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -18,11 +18,20 @@ void SchemaReader::append_unordered_column(BaseColumnReader* column_reader) { } void SchemaReader::mark_column_as_timestamp(BaseColumnReader* column_reader) { + constexpr epochtime_t cNanosecondsInMillisecond{1000 * 1000LL}; + constexpr epochtime_t cMillisecondsInSecond{1000LL}; m_timestamp_column = column_reader; - if (m_timestamp_column->get_type() == NodeType::DateString) { + if (m_timestamp_column->get_type() == NodeType::Timestamp) { m_get_timestamp = [this]() { - return static_cast(m_timestamp_column) + return static_cast(m_timestamp_column) + ->get_encoded_time(m_cur_message) + / cNanosecondsInMillisecond; + }; + } else if (m_timestamp_column->get_type() == NodeType::DeprecatedDateString) { + m_get_timestamp = [this]() { + return static_cast(m_timestamp_column) ->get_encoded_time(m_cur_message); + ; }; } else if (m_timestamp_column->get_type() == NodeType::Integer) { m_get_timestamp = [this]() { @@ -39,6 +48,7 @@ void SchemaReader::mark_column_as_timestamp(BaseColumnReader* column_reader) { return static_cast( std::get(static_cast(m_timestamp_column) ->extract_value(m_cur_message)) + * cMillisecondsInSecond ); }; } @@ -186,6 +196,13 @@ auto SchemaReader::generate_json_string(uint64_t message_index) -> std::string { m_json_serializer.append_value("null"); break; } + case JsonSerializer::Op::AddLiteralField: { + column = m_reordered_columns[column_id_index++]; + auto const key{m_global_schema_tree->get_node(column->get_id()).get_key_name()}; + m_json_serializer.append_key(key); + m_json_serializer.append_value_from_column(column, message_index); + break; + } } } @@ -474,9 +491,10 @@ size_t SchemaReader::generate_structured_array_template( 
m_json_serializer.add_op(JsonSerializer::Op::AddNullValue); break; } - case NodeType::DateString: + case NodeType::DeprecatedDateString: case NodeType::UnstructuredArray: case NodeType::Metadata: + case NodeType::Timestamp: case NodeType::Unknown: break; } @@ -566,9 +584,10 @@ size_t SchemaReader::generate_structured_object_template( m_json_serializer.add_special_key(node.get_key_name()); break; } - case NodeType::DateString: + case NodeType::DeprecatedDateString: case NodeType::UnstructuredArray: case NodeType::Metadata: + case NodeType::Timestamp: case NodeType::Unknown: break; } @@ -672,11 +691,16 @@ void SchemaReader::generate_json_template(int32_t id) { } case NodeType::ClpString: case NodeType::VarString: - case NodeType::DateString: { + case NodeType::DeprecatedDateString: { m_json_serializer.add_op(JsonSerializer::Op::AddStringField); m_reordered_columns.push_back(m_column_map[child_global_id]); break; } + case NodeType::Timestamp: { + m_json_serializer.add_op(JsonSerializer::Op::AddLiteralField); + m_reordered_columns.emplace_back(m_column_map.at(child_global_id)); + break; + } case NodeType::NullValue: { m_json_serializer.add_op(JsonSerializer::Op::AddNullField); m_json_serializer.add_special_key(key); diff --git a/components/core/src/clp_s/SchemaTree.cpp b/components/core/src/clp_s/SchemaTree.cpp index 0cac20aef5..83b9e667de 100644 --- a/components/core/src/clp_s/SchemaTree.cpp +++ b/components/core/src/clp_s/SchemaTree.cpp @@ -27,7 +27,8 @@ auto node_to_literal_type(NodeType type) -> clp_s::search::ast::LiteralType { return clp_s::search::ast::LiteralType::ArrayT; case NodeType::NullValue: return clp_s::search::ast::LiteralType::NullT; - case NodeType::DateString: + case NodeType::DeprecatedDateString: + case NodeType::Timestamp: return clp_s::search::ast::LiteralType::TimestampT; case NodeType::Metadata: case NodeType::Unknown: diff --git a/components/core/src/clp_s/SchemaTree.hpp b/components/core/src/clp_s/SchemaTree.hpp index 0617ea28a2..4e3000a094 
100644 --- a/components/core/src/clp_s/SchemaTree.hpp +++ b/components/core/src/clp_s/SchemaTree.hpp @@ -38,7 +38,7 @@ enum class NodeType : uint8_t { Object, UnstructuredArray, NullValue, - DateString, + DeprecatedDateString, StructuredArray, Metadata, DeltaInteger, diff --git a/components/core/src/clp_s/SingleFileArchiveDefs.hpp b/components/core/src/clp_s/SingleFileArchiveDefs.hpp index 2c8354ca72..7e23b6ef38 100644 --- a/components/core/src/clp_s/SingleFileArchiveDefs.hpp +++ b/components/core/src/clp_s/SingleFileArchiveDefs.hpp @@ -43,12 +43,15 @@ constexpr auto decompose_archive_version(uint32_t archive_version) // define the version constexpr uint8_t cArchiveMajorVersion = 0; -constexpr uint8_t cArchiveMinorVersion = 4; -constexpr uint16_t cArchivePatchVersion = 1; +constexpr uint8_t cArchiveMinorVersion = 5; +constexpr uint16_t cArchivePatchVersion = 0; constexpr uint32_t cArchiveVersion{ make_archive_version(cArchiveMajorVersion, cArchiveMinorVersion, cArchivePatchVersion) }; +// Format version markers for backwards compatibility. +constexpr uint32_t cDeprecatedDateStringFormatVersionMarker{make_archive_version(0, 5, 0)}; + // define the magic number constexpr std::array cStructuredSFAMagicNumber{0xFD, 0x2F, 0xC5, 0x30}; @@ -74,6 +77,14 @@ struct ArchiveHeader { ); } + /** + * @return Whether this archive can contain columns with the deprecated DateString timestamp + * format. 
+ */ + [[nodiscard]] auto has_deprecated_timestamp_format() const -> bool { + return version < cDeprecatedDateStringFormatVersionMarker; + } + uint8_t magic_number[4]{}; uint32_t version{}; uint64_t uncompressed_size{}; diff --git a/components/core/src/clp_s/TimestampDictionaryReader.cpp b/components/core/src/clp_s/TimestampDictionaryReader.cpp index a6bc40ac84..6a0b37a5eb 100644 --- a/components/core/src/clp_s/TimestampDictionaryReader.cpp +++ b/components/core/src/clp_s/TimestampDictionaryReader.cpp @@ -1,11 +1,24 @@ #include "TimestampDictionaryReader.hpp" +#include +#include +#include #include +#include + +#include + +#include #include "search/ast/SearchUtils.hpp" +#include "SingleFileArchiveDefs.hpp" +#include "TimestampPattern.hpp" namespace clp_s { -ErrorCode TimestampDictionaryReader::read(ZstdDecompressor& decompressor) { +auto TimestampDictionaryReader::read( + ZstdDecompressor& decompressor, + bool has_deprecated_timestamp_format +) -> ErrorCode { ErrorCode error; uint64_t range_index_size; error = decompressor.try_read_numeric_value(range_index_size); @@ -64,16 +77,47 @@ ErrorCode TimestampDictionaryReader::read(ZstdDecompressor& decompressor) { if (ErrorCodeSuccess != error) { return error; } - m_patterns[id] = TimestampPattern(0, pattern); + + if (has_deprecated_timestamp_format) { + m_deprecated_patterns.emplace(id, TimestampPattern(0, pattern)); + continue; + } + + auto timestamp_pattern_result{timestamp_parser::TimestampPattern::create(pattern)}; + if (timestamp_pattern_result.has_error()) { + auto const& timestamp_error{timestamp_pattern_result.error()}; + SPDLOG_ERROR( + "Error loading timestamp pattern `{}` - {} - {}", + pattern, + timestamp_error.category().name(), + timestamp_error.message() + ); + return ErrorCodeCorrupt; + } + m_timestamp_patterns.emplace(id, std::move(timestamp_pattern_result.value())); } return ErrorCodeSuccess; } -std::string -TimestampDictionaryReader::get_string_encoding(epochtime_t epoch, uint64_t format_id) const { 
+auto TimestampDictionaryReader::get_deprecated_timestamp_string_encoding( + epochtime_t epoch, + uint64_t format_id +) const -> std::string { std::string ret; - m_patterns.at(format_id).insert_formatted_timestamp(epoch, ret); + m_deprecated_patterns.at(format_id).insert_formatted_timestamp(epoch, ret); return ret; } + +void TimestampDictionaryReader::append_timestamp_to_buffer( + epochtime_t timestamp, + uint64_t format_id, + std::string& buffer +) const { + auto const& pattern{m_timestamp_patterns.at(format_id)}; + auto const marshal_result{timestamp_parser::marshal_timestamp(timestamp, pattern, buffer)}; + if (marshal_result.has_error()) { + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } +} } // namespace clp_s diff --git a/components/core/src/clp_s/TimestampDictionaryReader.hpp b/components/core/src/clp_s/TimestampDictionaryReader.hpp index 0a0a2665f5..8440faa617 100644 --- a/components/core/src/clp_s/TimestampDictionaryReader.hpp +++ b/components/core/src/clp_s/TimestampDictionaryReader.hpp @@ -1,9 +1,15 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYREADER_HPP #define CLP_S_TIMESTAMPDICTIONARYREADER_HPP -#include +#include #include +#include +#include +#include #include +#include + +#include #include "FileReader.hpp" #include "TimestampEntry.hpp" @@ -23,18 +29,24 @@ class TimestampDictionaryReader { // Methods /** - * Reads the timestamp dictionary from a decompressor + * Reads the timestamp dictionary from a decompressor. * @param decompressor + * @param has_deprecated_timestamp_format * @return ErrorCodeSuccess on success, and the relevant ErrorCode otherwise */ - ErrorCode read(ZstdDecompressor& decompressor); + [[nodiscard]] auto read(ZstdDecompressor& decompressor, bool has_deprecated_timestamp_format) + -> ErrorCode; /** - * Gets the string encoding for a given epoch and format ID + * Gets the string encoding for a given epoch and format ID by interpreting the pattern + * identified by `format_id` as a `clp_s::TimestampPattern`. 
* @param epoch * @param format_id + * @return The string encoding for the given epoch and format ID. */ - std::string get_string_encoding(epochtime_t epoch, uint64_t format_id) const; + [[nodiscard]] auto + get_deprecated_timestamp_string_encoding(epochtime_t epoch, uint64_t format_id) const + -> std::string; /** * Marshals and appends the `timestamp` to the `buffer` by interpreting the timestamp pattern @@ -49,15 +61,7 @@ class TimestampDictionaryReader { epochtime_t timestamp, uint64_t format_id, std::string& buffer - ) const { /*NO-OP until follow-up PR*/ } - - /** - * Gets iterators for the timestamp patterns - * @return begin and end iterators for the timestamp patterns - */ - auto pattern_begin() const { return m_patterns.begin(); } - - auto pattern_end() const { return m_patterns.end(); } + ) const; /** * Gets iterators for the column to range mappings @@ -77,12 +81,12 @@ class TimestampDictionaryReader { } private: - using id_to_pattern_t = std::map; using tokenized_column_to_range_t = std::vector, TimestampEntry*>>; // Variables - id_to_pattern_t m_patterns; + std::unordered_map m_deprecated_patterns; + std::unordered_map m_timestamp_patterns; std::vector m_entries; tokenized_column_to_range_t m_tokenized_column_to_range; diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 0c62a1964e..4a5af79f39 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -2,133 +2,193 @@ #include #include +#include #include +#include -namespace clp_s { -void TimestampDictionaryWriter::write_timestamp_entries( - std::map const& ranges, - std::stringstream& stream -) { - write_numeric_value(stream, ranges.size()); - - for (auto const& range : ranges) { - range.second.write_to_stream(stream); - } -} +#include +#include + +#include +namespace clp_s { void TimestampDictionaryWriter::write(std::stringstream& stream) { - 
merge_range(); - write_timestamp_entries(m_column_key_to_range, stream); + write_numeric_value(stream, m_column_id_to_range.size()); + for (auto const& [id, range] : m_column_id_to_range) { + range.write_to_stream(stream); + } - write_numeric_value(stream, m_pattern_to_id.size()); - for (auto& it : m_pattern_to_id) { - // write pattern ID - write_numeric_value(stream, it.second); + write_numeric_value( + stream, + m_string_pattern_and_id_pairs.size() + m_numeric_pattern_to_id.size() + ); + for (auto const& [quoted_pattern, pattern_id] : m_string_pattern_and_id_pairs) { + write_numeric_value(stream, pattern_id); - std::string const& pattern = it.first->get_format(); - write_numeric_value(stream, pattern.length()); - stream.write(pattern.data(), pattern.size()); + auto const raw_pattern{quoted_pattern.get_pattern()}; + write_numeric_value(stream, raw_pattern.length()); + stream.write(raw_pattern.data(), raw_pattern.size()); } -} -uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* pattern) { - auto it = m_pattern_to_id.find(pattern); - if (m_pattern_to_id.end() == it) { - uint64_t id = m_next_id++; - m_pattern_to_id.emplace(pattern, id); - return id; + for (auto const& [raw_pattern, pattern_and_id] : m_numeric_pattern_to_id) { + write_numeric_value(stream, pattern_and_id.second); + + write_numeric_value(stream, raw_pattern.size()); + stream.write(raw_pattern.data(), raw_pattern.size()); } - return it->second; } -epochtime_t TimestampDictionaryWriter::ingest_entry( +auto TimestampDictionaryWriter::ingest_string_timestamp( std::string_view key, int32_t node_id, std::string_view timestamp, - uint64_t& pattern_id -) { - epochtime_t ret; - size_t timestamp_begin_pos = 0, timestamp_end_pos = 0; - TimestampPattern const* pattern{nullptr}; + bool is_json_literal +) -> std::pair { + auto& [_, timestamp_entry] = *m_column_id_to_range.try_emplace(node_id, key, node_id).first; // Try parsing the timestamp as one of the previously seen timestamp patterns - 
for (auto it : m_pattern_to_id) { - if (it.first->parse_timestamp(timestamp, ret, timestamp_begin_pos, timestamp_end_pos)) { - pattern = it.first; - pattern_id = it.second; - break; + for (auto const& [quoted_pattern, pattern_id] : m_string_pattern_and_id_pairs) { + auto const parsing_result{timestamp_parser::parse_timestamp( + timestamp, + quoted_pattern, + is_json_literal, + m_generated_pattern + )}; + if (parsing_result.has_error()) { + continue; } + auto const epoch_timestamp{parsing_result.value().first}; + timestamp_entry.ingest_timestamp(epoch_timestamp); + return {epoch_timestamp, pattern_id}; } // Fall back to consulting all known timestamp patterns - if (nullptr == pattern) { - pattern = TimestampPattern::search_known_ts_patterns( - timestamp, - ret, - timestamp_begin_pos, - timestamp_end_pos - ); - pattern_id = get_pattern_id(pattern); - } - - if (nullptr == pattern) { + auto const parsing_result{timestamp_parser::search_known_timestamp_patterns( + timestamp, + m_quoted_timestamp_patterns, + is_json_literal, + m_generated_pattern + )}; + if (false == parsing_result.has_value()) { + SPDLOG_ERROR("Failed to parse timestamp `{}` against known timestamp patterns.", timestamp); throw OperationFailed(ErrorCodeFailure, __FILE__, __LINE__); } - auto entry = m_column_id_to_range.find(node_id); - if (entry == m_column_id_to_range.end()) { - TimestampEntry new_entry(key); - new_entry.ingest_timestamp(ret); - m_column_id_to_range.emplace(node_id, std::move(new_entry)); - } else { - entry->second.ingest_timestamp(ret); + auto const [epoch_timestamp, pattern] = parsing_result.value(); + timestamp_entry.ingest_timestamp(epoch_timestamp); + auto const quoted_pattern_result{timestamp_parser::TimestampPattern::create(pattern)}; + if (quoted_pattern_result.has_error()) { + auto const error{quoted_pattern_result.error()}; + SPDLOG_ERROR( + "Failed to create timestamp pattern: {} - {}.", + error.category().name(), + error.message() + ); + throw 
OperationFailed(ErrorCodeFailure, __FILE__, __LINE__); } - return ret; + auto const new_pattern_id{m_next_id++}; + m_string_pattern_and_id_pairs.emplace_back( + std::move(quoted_pattern_result.value()), + new_pattern_id + ); + return {epoch_timestamp, new_pattern_id}; } -void -TimestampDictionaryWriter::ingest_entry(std::string_view key, int32_t node_id, double timestamp) { - auto entry = m_column_id_to_range.find(node_id); - if (entry == m_column_id_to_range.end()) { - TimestampEntry new_entry(key); - new_entry.ingest_timestamp(timestamp); - m_column_id_to_range.emplace(node_id, std::move(new_entry)); - } else { - entry->second.ingest_timestamp(timestamp); +auto TimestampDictionaryWriter::ingest_numeric_json_timestamp( + std::string_view key, + int32_t node_id, + std::string_view timestamp +) -> std::pair { + auto& [_, timestamp_entry] = *m_column_id_to_range.try_emplace(node_id, key, node_id).first; + + for (auto const& [raw_pattern, pattern_and_id] : m_numeric_pattern_to_id) { + auto const& [pattern, id] = pattern_and_id; + auto const parsing_result{timestamp_parser::parse_timestamp( + timestamp, + pattern_and_id.first, + true, + m_generated_pattern + )}; + if (parsing_result.has_error()) { + continue; + } + auto const epoch_timestamp{parsing_result.value().first}; + timestamp_entry.ingest_timestamp(epoch_timestamp); + return {epoch_timestamp, pattern_and_id.second}; + } + + auto const optional_parsed_timestamp{timestamp_parser::search_known_timestamp_patterns( + timestamp, + m_numeric_timestamp_patterns, + true, + m_generated_pattern + )}; + if (false == optional_parsed_timestamp.has_value()) { + SPDLOG_ERROR("Failed to parse timestamp `{}` against known timestamp patterns.", timestamp); + throw OperationFailed(ErrorCodeFailure, __FILE__, __LINE__); } -} -void -TimestampDictionaryWriter::ingest_entry(std::string_view key, int32_t node_id, int64_t timestamp) { - auto entry = m_column_id_to_range.find(node_id); - if (entry == m_column_id_to_range.end()) { - 
TimestampEntry new_entry(key); - new_entry.ingest_timestamp(timestamp); - m_column_id_to_range.emplace(node_id, std::move(new_entry)); - } else { - entry->second.ingest_timestamp(timestamp); + auto const [epoch_timestamp, pattern] = optional_parsed_timestamp.value(); + timestamp_entry.ingest_timestamp(epoch_timestamp); + auto const pattern_result{timestamp_parser::TimestampPattern::create(pattern)}; + if (pattern_result.has_error()) { + auto const error{pattern_result.error()}; + SPDLOG_ERROR( + "Failed to create timestamp pattern: {} - {}.", + error.category().name(), + error.message() + ); + throw OperationFailed(ErrorCodeFailure, __FILE__, __LINE__); } + + auto const new_pattern_id{m_next_id++}; + m_numeric_pattern_to_id.emplace( + std::string{pattern}, + std::make_pair(std::move(pattern_result.value()), new_pattern_id) + ); + return {epoch_timestamp, new_pattern_id}; } -void TimestampDictionaryWriter::merge_range() { - for (auto const& it : m_column_id_to_range) { - std::string key = it.second.get_key_name(); - auto entry = m_column_key_to_range.find(key); - if (entry == m_column_key_to_range.end()) { - TimestampEntry new_entry = it.second; - new_entry.insert_column_id(it.first); - m_column_key_to_range.emplace(key, std::move(new_entry)); - } else { - entry->second.merge_range(it.second); - entry->second.insert_column_id(it.first); +auto TimestampDictionaryWriter::ingest_unknown_precision_epoch_timestamp( + std::string_view key, + int32_t node_id, + int64_t timestamp +) -> std::pair { + auto& [_, timestamp_entry] = *m_column_id_to_range.try_emplace(node_id, key, node_id).first; + + auto const [factor, precision] = timestamp_parser::estimate_timestamp_precision(timestamp); + auto const epoch_timestamp{timestamp * factor}; + timestamp_entry.ingest_timestamp(epoch_timestamp); + auto pattern{fmt::format("\\{}", precision)}; + + auto pattern_it{m_numeric_pattern_to_id.find(pattern)}; + if (m_numeric_pattern_to_id.end() == pattern_it) { + auto 
pattern_result{timestamp_parser::TimestampPattern::create(pattern)}; + if (pattern_result.has_error()) { + auto const error{pattern_result.error()}; + SPDLOG_ERROR( + "Failed to create timestamp pattern: {} - {}.", + error.category().name(), + error.message() + ); + throw OperationFailed(ErrorCodeFailure, __FILE__, __LINE__); } + auto const new_pattern_id{m_next_id++}; + pattern_it + = m_numeric_pattern_to_id + .emplace( + std::move(pattern), + std::make_pair(std::move(pattern_result.value()), new_pattern_id) + ) + .first; } + return {epoch_timestamp, pattern_it->second.second}; } epochtime_t TimestampDictionaryWriter::get_begin_timestamp() const { - auto it = m_column_key_to_range.begin(); - if (m_column_key_to_range.end() == it) { + auto it = m_column_id_to_range.begin(); + if (m_column_id_to_range.end() == it) { // replicate behaviour of CLP return 0; } @@ -137,8 +197,8 @@ epochtime_t TimestampDictionaryWriter::get_begin_timestamp() const { } epochtime_t TimestampDictionaryWriter::get_end_timestamp() const { - auto it = m_column_key_to_range.begin(); - if (m_column_key_to_range.end() == it) { + auto it = m_column_id_to_range.begin(); + if (m_column_id_to_range.end() == it) { // replicate behaviour of CLP return 0; } @@ -148,8 +208,8 @@ epochtime_t TimestampDictionaryWriter::get_end_timestamp() const { void TimestampDictionaryWriter::clear() { m_next_id = 0; - m_pattern_to_id.clear(); - m_column_key_to_range.clear(); + m_string_pattern_and_id_pairs.clear(); + m_numeric_pattern_to_id.clear(); m_column_id_to_range.clear(); } } // namespace clp_s diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 7c214a39e7..8afea80e29 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -9,9 +9,11 @@ #include #include +#include + #include "SchemaTree.hpp" #include "TimestampEntry.hpp" -#include "TimestampPattern.hpp" 
+#include "TraceableException.hpp" namespace clp_s { class TimestampDictionaryWriter { @@ -25,7 +27,15 @@ class TimestampDictionaryWriter { }; // Constructors - TimestampDictionaryWriter() {} + explicit TimestampDictionaryWriter() { + auto quoted_patterns_result{timestamp_parser::get_all_default_quoted_timestamp_patterns()}; + auto numeric_patterns_result{timestamp_parser::get_default_numeric_timestamp_patterns()}; + if (quoted_patterns_result.has_error() || numeric_patterns_result.has_error()) { + throw OperationFailed(ErrorCode::ErrorCodeFailure, __FILENAME__, __LINE__); + } + m_quoted_timestamp_patterns = std::move(quoted_patterns_result.value()); + m_numeric_timestamp_patterns = std::move(numeric_patterns_result.value()); + } /** * Writes the timestamp dictionary to a buffered stream. @@ -34,50 +44,57 @@ class TimestampDictionaryWriter { void write(std::stringstream& stream); /** - * Gets the pattern id for a given pattern - * @param pattern - * @return the pattern id - */ - uint64_t get_pattern_id(TimestampPattern const* pattern); - - /** - * Ingests a timestamp entry + * Ingests a timestamp entry from a string. * @param key * @param node_id * @param timestamp - * @param pattern_id - * @return the epoch time corresponding to the string timestamp + * @param is_json_literal + * @return A pair containing: + * - The timestamp in epoch nanoseconds. + * - The pattern ID corresponding to the timestamp format. */ - epochtime_t ingest_entry( + [[nodiscard]] auto ingest_string_timestamp( std::string_view key, int32_t node_id, std::string_view timestamp, - uint64_t& pattern_id - ); + bool is_json_literal + ) -> std::pair; /** - * Ingests a timestamp entry - * @param column_key + * Ingests a numeric JSON entry. + * @param key * @param node_id * @param timestamp + * @return A pair containing: + * - The timestamp in epoch nanoseconds. + * - The pattern ID corresponding to the timestamp format. 
*/ - void ingest_entry(std::string_view key, int32_t node_id, double timestamp); + [[nodiscard]] auto + ingest_numeric_json_timestamp(std::string_view key, int32_t node_id, std::string_view timestamp) + -> std::pair; - void ingest_entry(std::string_view key, int32_t node_id, int64_t timestamp); + /** + * Ingests an unknown precision epoch timestamp. + * @param key + * @param node_id + * @param timestamp + * @return A pair containing: + * - The timestamp in epoch nanoseconds. + * - The pattern ID corresponding to the timestamp format. + */ + [[nodiscard]] auto ingest_unknown_precision_epoch_timestamp( + std::string_view key, + int32_t node_id, + int64_t timestamp + ) -> std::pair; /** - * TODO: guarantee epoch milliseconds. The current clp-s approach to encoding timestamps and - * timestamp ranges makes no effort to convert second and nanosecond encoded timestamps into - * millisecond encoded timestamps. - * @return the beginning of this archive's time range as milliseconds since the UNIX epoch + * @return The beginning of this archive's time range as milliseconds since the UNIX epoch */ epochtime_t get_begin_timestamp() const; /** - * TODO: guarantee epoch milliseconds. The current clp-s approach to encoding timestamps and - * timestamp ranges makes no effort to convert second and nanosecond encoded timestamps into - * millisecond encoded timestamps. - * @return the end of this archive's time range as milliseconds since the UNIX epoch + * @return The end of this archive's time range as milliseconds since the UNIX epoch */ epochtime_t get_end_timestamp() const; @@ -87,29 +104,18 @@ class TimestampDictionaryWriter { void clear(); private: - /** - * Merges timestamp ranges with the same key name but different node ids. - */ - void merge_range(); - - /** - * Writes timestamp entries to a buffered stream. 
- * @param ranges - * @param compressor - */ - static void write_timestamp_entries( - std::map const& ranges, - std::stringstream& stream - ); - - using pattern_to_id_t = std::unordered_map; - // Variables - pattern_to_id_t m_pattern_to_id; + std::vector> + m_string_pattern_and_id_pairs; + absl::flat_hash_map> + m_numeric_pattern_to_id; uint64_t m_next_id{}; - std::map m_column_key_to_range; std::unordered_map m_column_id_to_range; + + std::string m_generated_pattern; + std::vector m_quoted_timestamp_patterns; + std::vector m_numeric_timestamp_patterns; }; } // namespace clp_s diff --git a/components/core/src/clp_s/TimestampEntry.cpp b/components/core/src/clp_s/TimestampEntry.cpp index 8a5ab65de5..cd0fda9fe8 100644 --- a/components/core/src/clp_s/TimestampEntry.cpp +++ b/components/core/src/clp_s/TimestampEntry.cpp @@ -9,55 +9,23 @@ using clp_s::search::ast::FilterOperation; namespace clp_s { -void TimestampEntry::ingest_timestamp(epochtime_t timestamp) { - if (m_encoding == DoubleEpoch) { - if (timestamp < std::ceil(m_epoch_start_double)) { - m_epoch_start_double = timestamp; - } - if (timestamp > std::floor(m_epoch_end_double)) { - m_epoch_end_double = timestamp; - } - - return; - } +namespace { +// Constants +constexpr epochtime_t cNanosecondsInMillisecond{1'000'000}; +} // namespace - if (m_encoding == UnkownTimestampEncoding) { - m_encoding = Epoch; - } - - if (timestamp < m_epoch_start) { - m_epoch_start = timestamp; - } - if (timestamp > m_epoch_end) { - m_epoch_end = timestamp; - } -} - -void TimestampEntry::ingest_timestamp(double timestamp) { - if (m_encoding == UnkownTimestampEncoding) { - m_encoding = DoubleEpoch; - } else if (m_encoding == Epoch) { - m_encoding = DoubleEpoch; - m_epoch_start_double = m_epoch_start; - m_epoch_end_double = m_epoch_end; - } - - if (timestamp < m_epoch_start_double) { - m_epoch_start_double = timestamp; - } - if (timestamp > m_epoch_end_double) { - m_epoch_end_double = timestamp; - } -} - -void 
TimestampEntry::merge_range(TimestampEntry const& entry) { - if (entry.m_encoding == Epoch) { - ingest_timestamp(entry.m_epoch_start); - ingest_timestamp(entry.m_epoch_end); - } else if (entry.m_encoding == DoubleEpoch) { - ingest_timestamp(entry.m_epoch_start_double); - ingest_timestamp(entry.m_epoch_end_double); - } +void TimestampEntry::ingest_timestamp(epochtime_t timestamp) { + m_encoding = Epoch; + auto const whole_milliseconds_in_timestamp{timestamp / cNanosecondsInMillisecond}; + auto const remainder_nanoseconds_in_timestamp{timestamp % cNanosecondsInMillisecond}; + auto const millisecond_timestamp_upper_bound{ + whole_milliseconds_in_timestamp + (remainder_nanoseconds_in_timestamp > 0 ? 1 : 0) + }; + auto const millisecond_timestamp_lower_bound{ + whole_milliseconds_in_timestamp - (remainder_nanoseconds_in_timestamp < 0 ? 1 : 0) + }; + m_epoch_start = std::min(m_epoch_start, millisecond_timestamp_lower_bound); + m_epoch_end = std::max(m_epoch_end, millisecond_timestamp_upper_bound); } void TimestampEntry::write_to_stream(std::stringstream& stream) const { diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index 8bf851bb10..5686b62113 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -42,13 +42,14 @@ class TimestampEntry { m_epoch_start(cEpochTimeMax), m_epoch_end(cEpochTimeMin) {} - TimestampEntry(std::string_view key_name) + TimestampEntry(std::string_view key_name, int32_t node_id) : m_encoding(UnkownTimestampEncoding), m_epoch_start_double(cDoubleEpochTimeMax), m_epoch_end_double(cDoubleEpochTimeMin), m_epoch_start(cEpochTimeMax), m_epoch_end(cEpochTimeMin), - m_key_name(key_name) {} + m_key_name(key_name), + m_column_ids{node_id} {} /** * Ingest a timestamp potentially adjusting the start and end bounds for this @@ -56,13 +57,6 @@ class TimestampEntry { * @param timestamp the timestamp to be ingested */ void ingest_timestamp(epochtime_t 
timestamp); - void ingest_timestamp(double timestamp); - - /** - * Merge a timestamp range potentially adjusting the start and end bounds for this - * @param timestamp the timestamp to be ingested - */ - void merge_range(TimestampEntry const& entry); /** * Write the timestamp entry to a buffered stream. @@ -96,25 +90,17 @@ class TimestampEntry { std::unordered_set const& get_column_ids() const { return m_column_ids; } - void insert_column_id(int32_t column_id) { m_column_ids.insert(column_id); } - - void insert_column_ids(std::unordered_set const& column_ids) { - m_column_ids.insert(column_ids.begin(), column_ids.end()); - } - /** - * TODO: guarantee epoch milliseconds. The current clp-s approach to encoding timestamps and - * timestamp ranges makes no effort to convert second and nanosecond encoded timestamps into - * millisecond encoded timestamps. - * @return the beginning of the time range as milliseconds since the UNIX epoch + * NOTE: The returned timestamp is not guaranteed to be millisecond precision for archives + * older than 0.5.0. + * @return The beginning of the time range as milliseconds since the UNIX epoch. */ epochtime_t get_begin_timestamp() const; /** - * TODO: guarantee epoch milliseconds. The current clp-s approach to encoding timestamps and - * timestamp ranges makes no effort to convert second and nanosecond encoded timestamps into - * millisecond encoded timestamps. - * @return the end of the time range as milliseconds since the UNIX epoch + * NOTE: The returned timestamp is not guaranteed to be millisecond precision for archives + * older than 0.5.0. 
+ * @return The end of the time range as milliseconds since the UNIX epoch */ epochtime_t get_end_timestamp() const; diff --git a/components/core/src/clp_s/TimestampPattern.cpp b/components/core/src/clp_s/TimestampPattern.cpp index e493ed72fe..84ca13d33b 100644 --- a/components/core/src/clp_s/TimestampPattern.cpp +++ b/components/core/src/clp_s/TimestampPattern.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -16,13 +15,8 @@ using clp::string_utils::convert_string_to_int; using std::string; using std::string_view; using std::to_string; -using std::vector; namespace clp_s { -// Static member default initialization -std::unique_ptr TimestampPattern::m_known_ts_patterns = nullptr; -size_t TimestampPattern::m_known_ts_patterns_len = 0; - // File-scope constants static constexpr int cNumDaysInWeek = 7; static char const* cAbbrevDaysOfWeek[cNumDaysInWeek] @@ -202,135 +196,6 @@ static bool convert_string_to_number_notz( return true; } -/* - * To initialize m_known_ts_patterns, we first create a vector of patterns then copy it to a - * dynamic array. This eases maintenance of the list and the cost doesn't matter since it is - * only done once when the program starts. - */ -void TimestampPattern::init() { - // Terminate if already initialized. - if (nullptr != m_known_ts_patterns) { - return; - } - // First create vector of observed patterns so that it's easy to maintain - vector patterns; - // E.g. 1706980946603 - patterns.emplace_back(0, "%E"); - // E.g. 1679711330.789032462 - patterns.emplace_back(0, "%F"); - - // E.g. 2022-04-06T03:33:23.476Z ...47, ...4 ...() - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S.%TZ"); - // E.g. 2022-04-06T03:33:23Z - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%SZ"); - // E.g. 2022-04-06 03:33:23.476Z ...47, ...4 ...() - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S.%TZ"); - // E.g. 2022-04-06 03:33:23Z - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%SZ"); - // E.g. 
2022/04/06T03:33:23.476Z ...47, ...4 ...() - patterns.emplace_back(0, "%Y/%m/%dT%H:%M:%S.%TZ"); - // E.g. 2022/04/06T03:33:23Z - patterns.emplace_back(0, "%Y/%m/%dT%H:%M:%SZ"); - // E.g. 2022/04/06 03:33:23.476Z ...47, ...4 ...() - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S.%TZ"); - // E.g. 2022/04/06 03:33:23Z - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%SZ"); - - // E.g. 2015-01-31T15:50:45.392 - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S.%3"); - // E.g. 2015-01-31T15:50:45,392 - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S,%3"); - // E.g. 2015-01-31 15:50:45.392 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S.%3"); - // E.g. 2015-01-31 15:50:45,392 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S,%3"); - // E.g. 2015/01/31T15:50:45.123 - patterns.emplace_back(0, "%Y/%m/%dT%H:%M:%S.%3"); - // E.g. 2015/01/31T15:50:45,123 - patterns.emplace_back(0, "%Y/%m/%dT%H:%M:%S,%3"); - // E.g. 2015/01/31 15:50:45.123 - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S.%3"); - // E.g. 2015/01/31 15:50:45,123 - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S,%3"); - // E.g. [2015-01-31 15:50:45,085] - patterns.emplace_back(0, "[%Y-%m-%d %H:%M:%S,%3]"); - // E.g. INFO [main] 2015-01-31 15:50:45,085 - patterns.emplace_back(2, "%Y-%m-%d %H:%M:%S,%3"); - // E.g. <<<2016-11-10 03:02:29:936 - patterns.emplace_back(0, "<<<%Y-%m-%d %H:%M:%S:%3"); - // E.g. 01 Jan 2016 15:50:17,085 - patterns.emplace_back(0, "%d %b %Y %H:%M:%S,%3"); - // E.g. 2015-01-31T15:50:45 - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S"); - // E.g. 2015-01-31 15:50:45 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S"); - // E.g. 2015/01/31T15:50:45 - patterns.emplace_back(0, "%Y/%m/%dT%H:%M:%S"); - // E.g. 2015/01/31 15:50:45 - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S"); - // E.g. [2015-01-31T15:50:45 - patterns.emplace_back(0, "[%Y-%m-%dT%H:%M:%S"); - // E.g. [20170106-16:56:41] - patterns.emplace_back(0, "[%Y%m%d-%H:%M:%S]"); - // E.g. Start-Date: 2015-01-31 15:50:45 - patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); - // E.g. 
15/01/31 15:50:45 - patterns.emplace_back(0, "%y/%m/%d %H:%M:%S"); - // E.g. 150131 9:50:45 - patterns.emplace_back(0, "%y%m%d %k:%M:%S"); - // E.g. Jan 01, 2016 3:50:17 PM - patterns.emplace_back(0, "%b %d, %Y %l:%M:%S %p"); - // E.g. January 31, 2015 15:50 - patterns.emplace_back(0, "%B %d, %Y %H:%M"); - // E.g. E [31/Jan/2015:15:50:45 - patterns.emplace_back(1, "[%d/%b/%Y:%H:%M:%S"); - // E.g. localhost - - [01/Jan/2016:15:50:17 - // E.g. 192.168.4.5 - - [01/Jan/2016:15:50:17 - patterns.emplace_back(3, "[%d/%b/%Y:%H:%M:%S"); - // E.g. 192.168.4.5 - - [01/01/2016:15:50:17 - patterns.emplace_back(3, "[%d/%m/%Y:%H:%M:%S"); - // E.g. Started POST "/api/v3/internal/allowed" for 127.0.0.1 at 2017-06-18 00:20:44 - patterns.emplace_back(6, "%Y-%m-%d %H:%M:%S"); - // E.g. update-alternatives 2015-01-31 15:50:45 - patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); - // E.g. ERROR: apport (pid 4557) Sun Jan 1 15:50:45 2015 - patterns.emplace_back(4, "%a %b %e %H:%M:%S %Y"); - // E.g. Sun Jan 1 15:50:45 2015 - patterns.emplace_back(0, "%a %b %e %H:%M:%S %Y"); - - // TODO These patterns are imprecise and will prevent searching by timestamp; but for now, - // it's no worse than not parsing a timestamp E.g. Jan 21 11:56:42 - patterns.emplace_back(0, "%b %d %H:%M:%S"); - // E.g. 
01-21 11:56:42.392 - patterns.emplace_back(0, "%m-%d %H:%M:%S.%3"); - - // Initialize m_known_ts_patterns with vector's contents - m_known_ts_patterns_len = patterns.size(); - m_known_ts_patterns = std::make_unique(m_known_ts_patterns_len); - for (size_t i = 0; i < patterns.size(); ++i) { - m_known_ts_patterns[i] = patterns[i]; - } -} - -TimestampPattern const* TimestampPattern::search_known_ts_patterns( - string_view line, - epochtime_t& timestamp, - size_t& timestamp_begin_pos, - size_t& timestamp_end_pos -) { - for (size_t i = 0; i < m_known_ts_patterns_len; ++i) { - if (m_known_ts_patterns[i] - .parse_timestamp(line, timestamp, timestamp_begin_pos, timestamp_end_pos)) - { - return &m_known_ts_patterns[i]; - } - } - - timestamp_begin_pos = string::npos; - timestamp_end_pos = string::npos; - return nullptr; -} - string const& TimestampPattern::get_format() const { return m_format; } diff --git a/components/core/src/clp_s/TimestampPattern.hpp b/components/core/src/clp_s/TimestampPattern.hpp index 171090741e..48caff4a6c 100644 --- a/components/core/src/clp_s/TimestampPattern.hpp +++ b/components/core/src/clp_s/TimestampPattern.hpp @@ -70,30 +70,6 @@ class TimestampPattern { m_format(std::move(format)) {} // Methods - /** - * Initializes global data required by this class. This function must be called before any other - * methods of the class are used. - * - * Note: this function is NOT thread-safe. 
- */ - static void init(); - - /** - * Searches for a known timestamp pattern which can parse the timestamp from the given line, and - * if found, parses the timestamp - * @param line - * @param timestamp Parsed timestamp - * @param timestamp_begin_pos - * @param timestamp_end_pos - * @return pointer to the timestamp pattern if found, nullptr otherwise - */ - static TimestampPattern const* search_known_ts_patterns( - std::string_view line, - epochtime_t& timestamp, - size_t& timestamp_begin_pos, - size_t& timestamp_end_pos - ); - /** * Gets the timestamp pattern's format string * @return See description diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index c2cf7b6abd..87d2b0af2b 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -39,7 +39,7 @@ #include "search/OutputHandler.hpp" #include "search/Projection.hpp" #include "search/SchemaMatch.hpp" -#include "TimestampPattern.hpp" +#include "SingleFileArchiveDefs.hpp" using namespace clp_s::search; using clp_s::cArchiveFormatDevelopmentVersionFlag; @@ -183,10 +183,12 @@ bool search_archive( return true; } - ast::SetTimestampLiteralPrecision date_precision_pass{ - ast::TimestampLiteral::Precision::Milliseconds - }; - expr = date_precision_pass.run(expr); + if (archive_reader->has_deprecated_timestamp_format()) { + ast::SetTimestampLiteralPrecision date_precision_pass{ + ast::TimestampLiteral::Precision::Milliseconds + }; + expr = date_precision_pass.run(expr); + } // Narrow against schemas auto match_pass = std::make_shared( @@ -302,7 +304,6 @@ int main(int argc, char const* argv[]) { return 1; } - clp_s::TimestampPattern::init(); mongocxx::instance const mongocxx_instance{}; clp::CurlGlobalInstance const curl_instance{}; diff --git a/components/core/src/clp_s/indexer/CMakeLists.txt b/components/core/src/clp_s/indexer/CMakeLists.txt index 86a0a32056..8de47163c9 100644 --- a/components/core/src/clp_s/indexer/CMakeLists.txt +++ 
b/components/core/src/clp_s/indexer/CMakeLists.txt @@ -141,6 +141,7 @@ if(CLP_BUILD_EXECUTABLES) Boost::program_options Boost::url ${CURL_LIBRARIES} clp::string_utils + clp_s::timestamp_parser date::date MariaDBClient::MariaDBClient msgpack-cxx diff --git a/components/core/src/clp_s/indexer/IndexManager.cpp b/components/core/src/clp_s/indexer/IndexManager.cpp index 4b2882e29e..93d32e6e04 100644 --- a/components/core/src/clp_s/indexer/IndexManager.cpp +++ b/components/core/src/clp_s/indexer/IndexManager.cpp @@ -142,12 +142,7 @@ void IndexManager::traverse_schema_tree_and_update_metadata( path_buffer += escape_key_name(node.get_key_name()); if (children_ids.empty() && NodeType::Object != node_type && NodeType::Unknown != node_type) { - // Always index authoritative timestamp as `NodeType::DateString` - auto const indexed_node_type - = timestamp_dict->get_authoritative_timestamp_column_ids().contains(node_id) - ? NodeType::DateString - : node_type; - m_field_update_callback(path_buffer, indexed_node_type); + m_field_update_callback(path_buffer, node_type); } for (auto child_id : children_ids) { diff --git a/components/core/src/clp_s/search/QueryRunner.cpp b/components/core/src/clp_s/search/QueryRunner.cpp index 3fa6a2e6a5..783d25409f 100644 --- a/components/core/src/clp_s/search/QueryRunner.cpp +++ b/components/core/src/clp_s/search/QueryRunner.cpp @@ -63,7 +63,8 @@ auto QueryRunner::schema_init(int32_t schema_id) -> EvaluatedValue { void QueryRunner::clear_readers() { m_clp_string_readers.clear(); m_var_string_readers.clear(); - m_datestring_readers.clear(); + m_timestamp_readers.clear(); + m_deprecated_datestring_reader = nullptr; m_basic_readers.clear(); } @@ -73,16 +74,25 @@ void QueryRunner::initialize_reader(int32_t column_id, BaseColumnReader* column_ & node_to_literal_type(m_schema_tree->get_node(column_id).get_type()))) || m_match->schema_searches_against_column(m_schema, column_id)) { - auto* clp_reader = dynamic_cast(column_reader); - auto* var_reader = 
dynamic_cast(column_reader); - auto* date_reader = dynamic_cast(column_reader); - if (nullptr != clp_reader && clp_reader->get_type() == NodeType::ClpString) { + if (auto* const clp_reader = dynamic_cast(column_reader); + nullptr != clp_reader && NodeType::ClpString == clp_reader->get_type()) + { m_clp_string_readers[column_id].push_back(clp_reader); - } else if (nullptr != var_reader && var_reader->get_type() == NodeType::VarString) { + } else if (auto* const var_reader + = dynamic_cast(column_reader); + nullptr != var_reader) + { m_var_string_readers[column_id].push_back(var_reader); - } else if (nullptr != date_reader) { - // Datestring readers with a given column ID are guaranteed not to repeat - m_datestring_readers.emplace(column_id, date_reader); + } else if (auto* const timestamp_reader + = dynamic_cast(column_reader); + nullptr != timestamp_reader) + { + m_timestamp_readers.emplace(column_id, timestamp_reader); + } else if (auto* const deprecated_date_reader + = dynamic_cast(column_reader); + nullptr != deprecated_date_reader) + { + m_deprecated_datestring_reader = deprecated_date_reader; } else { m_basic_readers[column_id].push_back(column_reader); } @@ -228,13 +238,19 @@ bool QueryRunner::evaluate_wildcard_filter(FilterExpr* expr, int32_t schema) { } if (column->matches_type(LiteralType::TimestampT)) { - for (auto entry : m_datestring_readers) { - if (false == matches_metadata && m_metadata_columns.contains(entry.first)) { - continue; - } - if (evaluate_epoch_date_filter(op, entry.second, literal)) { + if (nullptr != m_deprecated_datestring_reader) { + if (evaluate_epoch_date_filter(op, m_deprecated_datestring_reader, literal)) { return true; } + } else { + for (auto entry : m_timestamp_readers) { + if (false == matches_metadata && m_metadata_columns.contains(entry.first)) { + continue; + } + if (evaluate_timestamp_filter(op, entry.second, literal)) { + return true; + } + } } } @@ -307,12 +323,22 @@ bool QueryRunner::evaluate_filter(FilterExpr* expr, 
int32_t schema) { get_cached_decompressed_unstructured_array(column_id), literal ); - case LiteralType::TimestampT: - return evaluate_epoch_date_filter( + case LiteralType::TimestampT: { + if (nullptr != m_deprecated_datestring_reader + && m_deprecated_datestring_reader->get_id() == column_id) + { + return evaluate_epoch_date_filter( + expr->get_operation(), + m_deprecated_datestring_reader, + literal + ); + } + return evaluate_timestamp_filter( expr->get_operation(), - m_datestring_readers[column_id], + m_timestamp_readers.at(column_id), literal ); + } // case LiteralType::NullT: // null checks are always turned into existence operators -- // no need to evaluate here @@ -947,7 +973,7 @@ void QueryRunner::populate_searched_wildcard_columns(std::shared_ptr auto literal_type = node_to_literal_type(tree_node_type); matching_types |= literal_type; if (NodeType::ClpString != tree_node_type && NodeType::VarString != tree_node_type - && NodeType::DateString != tree_node_type) + && NodeType::DeprecatedDateString != tree_node_type) { m_wildcard_to_searched_basic_columns[col].insert(node); } @@ -1162,7 +1188,7 @@ EvaluatedValue QueryRunner::constant_propagate(std::shared_ptr const bool QueryRunner::evaluate_epoch_date_filter( FilterOperation op, - DateStringColumnReader* reader, + DeprecatedDateStringColumnReader* reader, std::shared_ptr& operand ) { if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { @@ -1176,4 +1202,21 @@ bool QueryRunner::evaluate_epoch_date_filter( return evaluate_int_filter_core(op, reader->get_encoded_time(m_cur_message), op_value); } + +auto QueryRunner::evaluate_timestamp_filter( + ast::FilterOperation op, + TimestampColumnReader* reader, + std::shared_ptr& operand +) -> bool { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + int64_t op_value{}; + if (false == operand->as_int(op_value, op)) { + return false; + } + + return evaluate_int_filter_core(op, 
reader->get_encoded_time(m_cur_message), op_value); +} } // namespace clp_s::search diff --git a/components/core/src/clp_s/search/QueryRunner.hpp b/components/core/src/clp_s/search/QueryRunner.hpp index 2d1d1896da..b5a0c14533 100644 --- a/components/core/src/clp_s/search/QueryRunner.hpp +++ b/components/core/src/clp_s/search/QueryRunner.hpp @@ -133,7 +133,8 @@ class QueryRunner : public FilterClass { std::unordered_map*> m_expr_var_match_map; std::unordered_map> m_clp_string_readers; std::unordered_map> m_var_string_readers; - std::unordered_map m_datestring_readers; + std::unordered_map m_timestamp_readers; + DeprecatedDateStringColumnReader* m_deprecated_datestring_reader{nullptr}; std::unordered_map> m_basic_readers; std::unordered_map m_extracted_unstructured_arrays; uint64_t m_cur_message{0}; @@ -267,7 +268,20 @@ class QueryRunner : public FilterClass { */ auto evaluate_epoch_date_filter( ast::FilterOperation op, - DateStringColumnReader* reader, + DeprecatedDateStringColumnReader* reader, + std::shared_ptr& operand + ) -> bool; + + /** + * Evaluates a timestamp filter. + * @param op + * @param reader + * @param operand + * @return Whether the filter evaluates to true. + */ + auto evaluate_timestamp_filter( + ast::FilterOperation op, + TimestampColumnReader* reader, std::shared_ptr& operand ) -> bool; diff --git a/components/core/src/clp_s/timestamp_parser/TimestampParser.cpp b/components/core/src/clp_s/timestamp_parser/TimestampParser.cpp index ce0f0c7d20..37f2fe03d0 100644 --- a/components/core/src/clp_s/timestamp_parser/TimestampParser.cpp +++ b/components/core/src/clp_s/timestamp_parser/TimestampParser.cpp @@ -243,20 +243,6 @@ find_first_matching_prefix(std::string_view str, std::span epochtime_t; -/** - * Estimates the precision of an epoch timestamp based on its proximity to 1971 in different - * precisions. - * - * This heuristic works because one year in epoch nanoseconds is approximately 1000 years in epoch - * microseconds, and so on. 
Note that this heuristic can not distinguish the precision of timestamps - * with absolute value sufficiently close to zero. - * - * @param timestamp - * @return A pair containing the scaling factor needed to convert the timestamp into nanosecond - * precision, and a format specifier indicating the precision of the timestamp. - */ -[[nodiscard]] auto estimate_timestamp_precision(int64_t timestamp) -> std::pair; - /** * Marshals a date-time timestamp according to a timestamp pattern. * @param timestamp @@ -541,30 +527,6 @@ auto extract_bracket_pattern_list(std::string_view str) return entry_offsets_and_sizes; } -auto estimate_timestamp_precision(int64_t timestamp) -> std::pair { - auto const abs_timestamp = timestamp < 0 ? -timestamp : timestamp; - if (abs_timestamp > cEpochNanoseconds1971) { - return std::make_pair(1LL, 'N'); - } - if (abs_timestamp > cEpochMicroseconds1971) { - constexpr auto cFactor{cPowersOfTen - [cNumNanosecondPrecisionSubsecondDigits - - cNumMicrosecondPrecisionSubsecondDigits]}; - return std::make_pair(cFactor, 'C'); - } - if (abs_timestamp > cEpochMilliseconds1971) { - constexpr auto cFactor{cPowersOfTen - [cNumNanosecondPrecisionSubsecondDigits - - cNumMillisecondPrecisionSubsecondDigits]}; - return std::make_pair(cFactor, 'L'); - } - constexpr auto cFactor{ - cPowersOfTen - [cNumNanosecondPrecisionSubsecondDigits - cNumSecondPrecisionSubsecondDigits] - }; - return std::make_pair(cFactor, 'E'); -} - auto extract_absolute_subsecond_nanoseconds(epochtime_t timestamp) -> epochtime_t { constexpr auto cFactor{ cPowersOfTen @@ -1856,6 +1818,30 @@ auto marshal_timestamp(epochtime_t timestamp, TimestampPattern const& pattern, s return std::nullopt; } +auto estimate_timestamp_precision(int64_t timestamp) -> std::pair { + auto const abs_timestamp = timestamp < 0 ? 
-timestamp : timestamp; + if (abs_timestamp > cEpochNanoseconds1971) { + return std::make_pair(1LL, 'N'); + } + if (abs_timestamp > cEpochMicroseconds1971) { + constexpr auto cFactor{cPowersOfTen + [cNumNanosecondPrecisionSubsecondDigits + - cNumMicrosecondPrecisionSubsecondDigits]}; + return std::make_pair(cFactor, 'C'); + } + if (abs_timestamp > cEpochMilliseconds1971) { + constexpr auto cFactor{cPowersOfTen + [cNumNanosecondPrecisionSubsecondDigits + - cNumMillisecondPrecisionSubsecondDigits]}; + return std::make_pair(cFactor, 'L'); + } + constexpr auto cFactor{ + cPowersOfTen + [cNumNanosecondPrecisionSubsecondDigits - cNumSecondPrecisionSubsecondDigits] + }; + return std::make_pair(cFactor, 'E'); +} + auto get_default_date_time_timestamp_patterns() -> ystdlib::error_handling::Result> { std::vector timestamp_patterns; diff --git a/components/core/src/clp_s/timestamp_parser/TimestampParser.hpp b/components/core/src/clp_s/timestamp_parser/TimestampParser.hpp index 7c5128bc9c..859987a6f4 100644 --- a/components/core/src/clp_s/timestamp_parser/TimestampParser.hpp +++ b/components/core/src/clp_s/timestamp_parser/TimestampParser.hpp @@ -274,6 +274,20 @@ marshal_timestamp(epochtime_t timestamp, TimestampPattern const& pattern, std::s std::string& generated_pattern ) -> std::optional>; +/** + * Estimates the precision of an epoch timestamp based on its proximity to 1971 in different + * precisions. + * + * This heuristic works because one year in epoch nanoseconds is approximately 1000 years in epoch + * microseconds, and so on. Note that this heuristic can not distinguish the precision of timestamps + * with absolute value sufficiently close to zero. + * + * @param timestamp + * @return A pair containing the scaling factor needed to convert the timestamp into nanosecond + * precision, and a format specifier indicating the precision of the timestamp. 
+ */ +[[nodiscard]] auto estimate_timestamp_precision(int64_t timestamp) -> std::pair; + /** * @return A result containing a vector of date-time timestamp patterns, or an error code indicating * the failure: diff --git a/components/core/tests/clp_s_test_utils.cpp b/components/core/tests/clp_s_test_utils.cpp index d6da23b8f0..3a1da2000a 100644 --- a/components/core/tests/clp_s_test_utils.cpp +++ b/components/core/tests/clp_s_test_utils.cpp @@ -46,7 +46,6 @@ auto compress_archive( parser_option.timestamp_key = std::move(timestamp_key.value()); } - clp_s::TimestampPattern::init(); clp_s::JsonParser parser{parser_option}; std::vector archive_stats; REQUIRE(parser.ingest()); diff --git a/components/core/tests/test-clp_s-end_to_end.cpp b/components/core/tests/test-clp_s-end_to_end.cpp index 85bedde75e..8991d81643 100644 --- a/components/core/tests/test-clp_s-end_to_end.cpp +++ b/components/core/tests/test-clp_s-end_to_end.cpp @@ -37,6 +37,7 @@ constexpr std::string_view cTestEndToEndValidFormattedFloatInputFile{ constexpr std::string_view cTestEndToEndInvalidFormattedFloatInputFile{ "test_invalid_formatted_float.jsonl" }; +constexpr std::string_view cTestEndToEndTimestampInputFile{"test_timestamp.jsonl"}; namespace { auto get_test_input_path_relative_to_tests_dir(std::string_view const test_input_path) @@ -329,3 +330,36 @@ TEST_CASE("clp-s-compress-extract-invalid-formatted-floats", "[clp-s][end-to-end extracted_json_path ); } + +TEST_CASE("clp-s-compress-extract-timestamps", "[clp-s][end-to-end]") { + constexpr std::string_view cTimestampColumn{"timestamp"}; + auto single_file_archive = GENERATE(true, false); + + TestOutputCleaner const test_cleanup{ + {std::string{cTestEndToEndArchiveDirectory}, + std::string{cTestEndToEndOutputDirectory}, + std::string{cTestEndToEndOutputSortedJson}, + std::string{cTestEndToEndExpectedOutputSortedFile}} + }; + + REQUIRE_NOTHROW( + std::ignore = compress_archive( + get_test_input_local_path(cTestEndToEndTimestampInputFile), + 
std::string{cTestEndToEndArchiveDirectory}, + std::string{cTimestampColumn}, + true, + single_file_archive, + false + ) + ); + validate_archive_header(); + + std::set const expected_matching_types{clp_s::NodeType::Timestamp}; + check_all_leaf_nodes_match_types(expected_matching_types); + + auto extracted_json_path = extract(); + literallyCompare( + get_test_input_local_path(cTestEndToEndTimestampInputFile), + extracted_json_path + ); +} diff --git a/components/core/tests/test-clp_s-search.cpp b/components/core/tests/test-clp_s-search.cpp index 526d2d45a8..1bc3033cd6 100644 --- a/components/core/tests/test-clp_s-search.cpp +++ b/components/core/tests/test-clp_s-search.cpp @@ -224,7 +224,7 @@ TEST_CASE("clp-s-search", "[clp-s][search]") { {R"aa(idx: 0 AND NOT $_filename: "clp string")aa", {0}}, {R"aa(idx: 0 AND NOT $*._filename.*: "clp string")aa", {0}}, {R"aa(($_filename: file OR $_file_split_number: 1 OR $_archive_creator_id > 0) AND )aa" - R"aa(idx: 0 OR idx: 1)aa", + R"aa(idx: 0 OR idx: timestamp("1"))aa", {1}}, {R"aa(ambiguous_varstring: "a*e")aa", {10, 11, 12}}, {R"aa(ambiguous_varstring: "a\*e")aa", {12}}, @@ -296,10 +296,12 @@ TEST_CASE("clp-s-search-formatted-float", "[clp-s][search]") { TEST_CASE("clp-s-search-float-timestamp", "[clp-s][search]") { std::vector>> queries_and_results{ - {R"aa(timestamp < 1759417024.4)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417023.1)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417024)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417024.1 AND timestamp < 1759417024.3)aa", {1}}, + {R"aa(timestamp < timestamp("1759417024.4"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417023.1"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417024"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417024.1") AND )aa" + R"aa(timestamp < timestamp("1759417024.3"))aa", + {1}} }; auto single_file_archive = GENERATE(true, false); auto retain_float_format = GENERATE(true, false); @@ -325,11 +327,13 @@ TEST_CASE("clp-s-search-float-timestamp", 
"[clp-s][search]") { TEST_CASE("clp-s-search-epoch-timestamp", "[clp-s][search]") { std::vector>> queries_and_results{ - {R"aa(timestamp < 1759417024400)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417023100)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417024000)aa", {0, 1, 2}}, - {R"aa(timestamp > 1759417024100 AND timestamp < 1759417024300)aa", {1}}, - {R"aa(timestamp > 1759417024299.9)aa", {2}} + {R"aa(timestamp < timestamp("1759417024400"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417023100"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417024000"))aa", {0, 1, 2}}, + {R"aa(timestamp > timestamp("1759417024100") AND )aa" + R"aa(timestamp < timestamp("1759417024300"))aa", + {1}}, + {R"aa(timestamp > timestamp("1759417024.299"))aa", {2}} }; auto single_file_archive = GENERATE(true, false); diff --git a/components/core/tests/test_log_files/test_timestamp.jsonl b/components/core/tests/test_log_files/test_timestamp.jsonl new file mode 100644 index 0000000000..d1a94b33f0 --- /dev/null +++ b/components/core/tests/test_log_files/test_timestamp.jsonl @@ -0,0 +1,5 @@ +{"timestamp" : "2026-01-01T12:34:56.789 UTC-05"} +{"timestamp" : "2026-01-01T12:34:56.789101112 UTC+02:30"} +{"timestamp" : "2015678901234"} +{"timestamp" : 2015678901234999} +{"timestamp" : 2015678901.234999123} diff --git a/components/webui/client/src/pages/SearchPage/SearchState/Presto/useTimestampKeyInit/sql.ts b/components/webui/client/src/pages/SearchPage/SearchState/Presto/useTimestampKeyInit/sql.ts index 244fe877f7..92d7a2612b 100644 --- a/components/webui/client/src/pages/SearchPage/SearchState/Presto/useTimestampKeyInit/sql.ts +++ b/components/webui/client/src/pages/SearchPage/SearchState/Presto/useTimestampKeyInit/sql.ts @@ -13,10 +13,11 @@ enum CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES { } /** - * Matching the `NodeType::DateString` value in + * Matching the `NodeType::DeprecatedDateString` and `NodeType::Timestamp` values in * `clp/components/core/src/clp_s/SchemaTree.hpp`. 
*/ -const TIMESTAMP_TYPE = 8; +const DEPRECATED_TIMESTAMP_TYPE = 8; +const TIMESTAMP_TYPE = 14; interface TimestampColumnItem { [CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES.NAME]: string; @@ -32,7 +33,8 @@ const buildTimestampColumnsSql = (datasetName: string): string => ` SELECT DISTINCT ${CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES.NAME} FROM ${settings.SqlDbClpTablePrefix}${datasetName}_${SqlTableSuffix.COLUMN_METADATA} - WHERE ${CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES.TYPE} = ${TIMESTAMP_TYPE} + WHERE ${CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES.TYPE} IN + (${TIMESTAMP_TYPE}, ${DEPRECATED_TIMESTAMP_TYPE}) ORDER BY ${CLP_COLUMN_METADATA_TABLE_COLUMN_NAMES.NAME}; `;