-
Notifications
You must be signed in to change notification settings - Fork 3
feat: Add support for searching and marshalling the new Timestamp column type.
#51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b108580
1671230
4473c26
46dedae
31a5253
d81779b
45d166a
57ee741
fd51806
aaf5d96
460a88f
c78f992
91ecd2f
19c7b17
aa083a4
59af129
845a75e
3b7153f
1244c47
646d7b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -494,8 +494,8 @@ else() | |
| endif() | ||
| velox_resolve_dependency(glog) | ||
|
|
||
| velox_set_source(fmt) | ||
| velox_resolve_dependency(fmt 9.0.0) | ||
| set(fmt_SOURCE BUNDLED) | ||
| velox_resolve_dependency(fmt 11.2.0) | ||
|
Comment on lines
+497
to
+498
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Check which fmt APIs are used across the codebase to assess breakage risk from 9→11
echo "=== fmt includes ==="
rg -n '#include.*fmt' --type=cpp --type=h -g '!third_party' -g '!_build' | head -60
echo ""
echo "=== Usage of deprecated fmt APIs (ostream.h, printf.h, format_int, BasicFormatter, BasicStringRef) ==="
rg -n 'fmt/ostream\.h|fmt/printf\.h|fmt::format_int|fmt::BasicFormatter|fmt::BasicStringRef' --type=cpp --type=h -g '!third_party' -g '!_build'
echo ""
echo "=== Check spdlog dependency handling in CMakeLists.txt ==="
rg -B3 -A3 'spdlog' CMakeLists.txt | head -40Repository: y-scope/velox Length of output: 4587 🏁 Script executed: #!/bin/bash
# Check the context around lines 497-498 in CMakeLists.txt
echo "=== Context around fmt dependency (lines 490-510) ==="
sed -n '490,510p' CMakeLists.txt
echo ""
echo "=== Check if fmt_SOURCE is set conditionally elsewhere ==="
rg -n 'fmt_SOURCE' CMakeLists.txt
echo ""
echo "=== Check CLP connector block boundaries ==="
rg -B5 -A5 'VELOX_ENABLE_CLP_CONNECTOR' CMakeLists.txt | head -80Repository: y-scope/velox Length of output: 1985 🌐 Web query:
💡 Result: ABI / binary compatibility (shared library)
Source compatibility: notable breaking / behavior changes from 9.0.0 → 11.2.0fmt 9.0.0 (big one if you relied on ostream fallback)
fmt 10.0.0
fmt 11.0.0
fmt 11.2.0
Citations:
Critical: fmt version bump to 11.2.0 will break code using The codebase includes
The change is unconditional (not restricted to the CLP connector block), so it affects all build configurations. Before merging, either:
Additionally, fmt 11.2.0 requires binary rebuild of all dependents (incompatible with libfmt.so.9). 🤖 Prompt for AI Agents |
||
|
|
||
| if(VELOX_ENABLE_COMPRESSION_LZ4) | ||
| find_package(lz4 REQUIRED) | ||
|
|
@@ -541,12 +541,12 @@ if(${VELOX_ENABLE_CLP_CONNECTOR}) | |
| velox_resolve_dependency(spdlog) | ||
|
|
||
| # Dependencies that depend on other dependencies | ||
| velox_set_source(log_surgeon) | ||
| velox_resolve_dependency(log_surgeon) | ||
|
|
||
| velox_set_source(ystdlib) | ||
| velox_resolve_dependency(ystdlib) | ||
|
|
||
| velox_set_source(log_surgeon) | ||
| velox_resolve_dependency(log_surgeon) | ||
|
|
||
| set(clp_SOURCE BUNDLED) | ||
| velox_resolve_dependency(clp) | ||
| endif() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
|
|
||
| #pragma once | ||
|
|
||
| #include "clp_s/Defs.hpp" | ||
| #include "velox/type/Timestamp.h" | ||
|
|
||
| namespace facebook::velox::connector::clp::search_lib { | ||
|
|
@@ -121,4 +122,19 @@ inline auto convertToVeloxTimestamp(int64_t timestamp) -> Timestamp { | |
| return Timestamp(seconds, static_cast<uint64_t>(nanoseconds)); | ||
| } | ||
|
|
||
| /// Converts a nanosecond precision epochtime_t into a Velox timestamp. | ||
| /// | ||
| /// @param timestamp the input timestamp as an integer | ||
| /// @return the corresponding Velox timestamp | ||
| inline auto convertNanosecondEpochToVeloxTimestamp(clp_s::epochtime_t timestamp) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this called anywhere? This looks like the handler I am looking for, but probably need to call it in the VectorLoader if I understand correctly. |
||
| -> Timestamp { | ||
| int64_t seconds{timestamp / Timestamp::kNanosInSecond}; | ||
| int64_t nanoseconds{timestamp % Timestamp::kNanosInSecond}; | ||
| if (nanoseconds < 0) { | ||
| seconds -= 1; | ||
| nanoseconds += Timestamp::kNanosInSecond; | ||
| } | ||
| return Timestamp(seconds, static_cast<uint64_t>(nanoseconds)); | ||
| } | ||
|
|
||
| } // namespace facebook::velox::connector::clp::search_lib | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -17,9 +17,12 @@ | |||||
| #include <glog/logging.h> | ||||||
|
|
||||||
| #include "clp_s/ArchiveReader.hpp" | ||||||
| #include "clp_s/SingleFileArchiveDefs.hpp" | ||||||
| #include "clp_s/search/EvaluateTimestampIndex.hpp" | ||||||
| #include "clp_s/search/ast/EmptyExpr.hpp" | ||||||
| #include "clp_s/search/ast/SearchUtils.hpp" | ||||||
| #include "clp_s/search/ast/SetTimestampLiteralPrecision.hpp" | ||||||
| #include "clp_s/search/ast/TimestampLiteral.hpp" | ||||||
| #include "velox/connectors/clp/ClpColumnHandle.h" | ||||||
| #include "velox/connectors/clp/search_lib/archive/ClpArchiveCursor.h" | ||||||
| #include "velox/connectors/clp/search_lib/archive/ClpArchiveJsonStringVectorLoader.h" | ||||||
|
|
@@ -128,14 +131,23 @@ ErrorCode ClpArchiveCursor::loadSplit() { | |||||
| archiveReader_->open( | ||||||
| get_path_object_for_raw_path(splitPath_), networkAuthOption); | ||||||
| } catch (std::exception& e) { | ||||||
| VLOG(2) << "Failed to open archive file: " << e.what(); | ||||||
| VLOG(2) << "Failed to open archive file: " << splitPath_ << ": " | ||||||
| << e.what(); | ||||||
| return ErrorCode::InternalError; | ||||||
| } | ||||||
|
|
||||||
| auto timestampDict = archiveReader_->get_timestamp_dictionary(); | ||||||
| auto schemaTree = archiveReader_->get_schema_tree(); | ||||||
| auto schemaMap = archiveReader_->get_schema_map(); | ||||||
|
|
||||||
| auto const defaultTimestampPrecision{ | ||||||
| archiveReader_->get_header().version < clp_s::cNewTimestampFormatVersion | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| ? TimestampLiteral::Precision::Milliseconds | ||||||
| : TimestampLiteral::Precision::Nanoseconds}; | ||||||
| SetTimestampLiteralPrecision timestampPrecisionPass{ | ||||||
| defaultTimestampPrecision}; | ||||||
| expr_ = timestampPrecisionPass.run(expr_); | ||||||
|
|
||||||
| EvaluateTimestampIndex timestampIndex(timestampDict); | ||||||
| if (clp_s::EvaluatedValue::False == timestampIndex.run(expr_)) { | ||||||
| VLOG(2) << "No matching timestamp ranges for query '" << query_ << "'"; | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,7 +72,7 @@ void ClpArchiveVectorLoader::populateTimestampData( | |
| case clp_s::NodeType::FormattedFloat: | ||
| case clp_s::NodeType::DictionaryFloat: | ||
| case clp_s::NodeType::Integer: | ||
| case clp_s::NodeType::DateString: | ||
| case clp_s::NodeType::DeprecatedDateString: | ||
| supportedNodeType = true; | ||
| break; | ||
| default: | ||
|
|
@@ -115,7 +115,8 @@ void ClpArchiveVectorLoader::populateTimestampData( | |
| convertToVeloxTimestamp( | ||
| std::get<int64_t>(reader->extract_value(messageIndex)))); | ||
| } else { | ||
| auto reader = static_cast<clp_s::DateStringColumnReader*>(columnReader_); | ||
| auto reader = | ||
| static_cast<clp_s::DeprecatedDateStringColumnReader*>(columnReader_); | ||
| vector->set( | ||
| vectorIndex, | ||
| convertToVeloxTimestamp(reader->get_encoded_time(messageIndex))); | ||
|
|
@@ -214,8 +215,9 @@ void ClpArchiveVectorLoader::loadInternal( | |
| populateTimestampData<clp_s::NodeType::Integer>(rows, timestampVector); | ||
| } else if ( | ||
| nullptr != | ||
| dynamic_cast<clp_s::DateStringColumnReader*>(columnReader_)) { | ||
| populateTimestampData<clp_s::NodeType::DateString>( | ||
| dynamic_cast<clp_s::DeprecatedDateStringColumnReader*>( | ||
| columnReader_)) { | ||
| populateTimestampData<clp_s::NodeType::DeprecatedDateString>( | ||
| rows, timestampVector); | ||
| } else if ( | ||
| nullptr != dynamic_cast<clp_s::FloatColumnReader*>(columnReader_)) { | ||
|
|
@@ -261,13 +263,21 @@ template void | |
| ClpArchiveVectorLoader::populateTimestampData<clp_s::NodeType::Integer>( | ||
| RowSet rows, | ||
| FlatVector<facebook::velox::Timestamp>* vector); | ||
| template void | ||
| ClpArchiveVectorLoader::populateTimestampData<clp_s::NodeType::DateString>( | ||
| template void ClpArchiveVectorLoader::populateTimestampData< | ||
| clp_s::NodeType::DeprecatedDateString>( | ||
| RowSet rows, | ||
| FlatVector<facebook::velox::Timestamp>* vector); | ||
| template void | ||
| ClpArchiveVectorLoader::populateTimestampData<clp_s::NodeType::Unknown>( | ||
| RowSet rows, | ||
| FlatVector<facebook::velox::Timestamp>* vector); | ||
| template void | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the changes make sense for I imagine there must be some handler for Timestamp right? |
||
| ClpArchiveVectorLoader::populateTimestampData<clp_s::NodeType::FormattedFloat>( | ||
| RowSet rows, | ||
| FlatVector<facebook::velox::Timestamp>* vector); | ||
| template void | ||
| ClpArchiveVectorLoader::populateTimestampData<clp_s::NodeType::DictionaryFloat>( | ||
| RowSet rows, | ||
| FlatVector<facebook::velox::Timestamp>* vector); | ||
|
|
||
| } // namespace facebook::velox::connector::clp::search_lib | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -576,8 +576,8 @@ TEST_F(ClpConnectorTest, test4IrTimestampNoPushdown) { | |
| TEST_F(ClpConnectorTest, test4IrTimestampPushdown) { | ||
| // Only the second event meet the condition, the first event is a date string | ||
| // which is not supported yet so the value will be NULL. | ||
| const std::shared_ptr<std::string> kqlQuery = | ||
| std::make_shared<std::string>("(timestamp < 1756003005000000)"); | ||
| const std::shared_ptr<std::string> kqlQuery = std::make_shared<std::string>( | ||
| R"(timestamp < timestamp("1756003005000000", "\L"))"); | ||
|
Comment on lines
+579
to
+580
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: In CrowdStrike LogScale / Humio “KQL” time formatting, the milliseconds component is the
Example (milliseconds shown as Note: In LogScale, the time modifiers like Sources Citations: 🏁 Script executed: # First, let's locate and examine the test file mentioned
fd ClpConnectorTest.cppRepository: y-scope/velox Length of output: 104 🏁 Script executed: # Check the exact content at lines 579-580 and 659-660
if fd ClpConnectorTest.cpp; then
echo "=== Lines 579-580 ===" && \
sed -n '575,585p' $(fd ClpConnectorTest.cpp)
echo -e "\n=== Lines 659-660 ===" && \
sed -n '655,665p' $(fd ClpConnectorTest.cpp)
fiRepository: y-scope/velox Length of output: 1262 🌐 Web query:
💡 Result: In YScope/CLP, the “timestamp formatting” you’re probably looking for is the YScope Log Viewer format-string
If instead you meant CLP’s KQL-like search (e.g., Sources: [1] [2] [3] [4] 🌐 Web query:
💡 Result: In CLP’s KQL (used by
Note: CLP docs also state time zone information is not preserved, so avoid relying on timezone offsets in stored/queried timestamps. [1] Source: CLP 🏁 Script executed: # Search for CLP-related documentation or comments in the codebase
rg -i "kql.*timestamp|format.*specifier|\\\L" velox/connectors/clp/ -A 3 -B 3 | head -100Repository: y-scope/velox Length of output: 2215 🏁 Script executed: # Look for any CLP documentation or README
fd -e md -e txt -e rst velox/connectors/clp/ | xargs cat 2>/dev/null | head -200Repository: y-scope/velox Length of output: 446 🏁 Script executed: # Search for references to "\L" format or timestamp parsing logic in CLP connector
rg "\\\\L|format.*timestamp|timestamp.*precision" velox/connectors/clp/ -i | head -50Repository: y-scope/velox Length of output: 3517 🏁 Script executed: # Search for SetTimestampLiteralPrecision and how it handles format specifiers
rg "SetTimestampLiteralPrecision|TimestampLiteral.*Precision" velox/connectors/clp/ -B 5 -A 10Repository: y-scope/velox Length of output: 6629 🏁 Script executed: # Look for handling of "\L" format specifier or how timestamp format is parsed
rg -i "format.*\\\\L|\\\\L.*format|timestamp.*format" velox/connectors/clp/ -B 3 -A 3Repository: y-scope/velox Length of output: 2943 🏁 Script executed: # Search for estimatePrecision usage and implementation
rg "estimatePrecision" velox/connectors/clp/ -B 5 -A 10Repository: y-scope/velox Length of output: 4583 🏁 Script executed: # Check if there's any logic that treats "\L" as a special case
rg "\\\\L|format.*specifier" velox/connectors/clp/search_lib/ -B 2 -A 2Repository: y-scope/velox Length of output: 39 🏁 Script executed: # Look at the test file to see if there are any assertions or comments about expected behaviour
sed -n '575,610p' $(fd ClpConnectorTest.cpp)Repository: y-scope/velox Length of output: 1522 🏁 Script executed: # Check the full test5 content to see the complete query and assertion
sed -n '655,690p' $(fd ClpConnectorTest.cpp)Repository: y-scope/velox Length of output: 1707 🏁 Script executed: # Search for any comments or documentation about format specifiers in test file
rg -i "format|specifier|precision|\\\\L" $(fd ClpConnectorTest.cpp) -B 2 -A 2 | head -80Repository: y-scope/velox Length of output: 2202 🏁 Script executed: # Check ClpTimestampsUtils.h in full to understand the thresholds better
cat velox/connectors/clp/search_lib/ClpTimestampsUtils.h | head -100Repository: y-scope/velox Length of output: 3740 🏁 Script executed: # Search for how SetTimestampLiteralPrecision works - does it force or suggest precision
rg -i "SetTimestampLiteralPrecision" --type cpp -A 5 -B 2 | head -60Repository: y-scope/velox Length of output: 3387 🏁 Script executed: # Check if there's a comments in code about why test4 might work despite the large value
sed -n '568,620p' $(fd ClpConnectorTest.cpp)Repository: y-scope/velox Length of output: 2154 🏁 Script executed: # Look for what kTestTimestampSeconds is set to
rg "kTestTimestampSeconds|kTestTimestampNanoseconds" $(fd ClpConnectorTest.cpp) -B 2 -A 1Repository: y-scope/velox Length of output: 1726 Fix test4's timestamp precision to match test5's format and CLP IR millisecond specification. The test4 timestamp query uses Given that CLP IR explicitly treats timestamp literals as millisecond precision (via test4 currently passes only because the archive contains sparse data with a single event well below any reasonable threshold. The 16-digit value either gets misinterpreted as nanoseconds by auto-detection (exceeding the 31.5 trillion nanosecond threshold) or happens to work due to the lax filter boundary. This becomes a latent bug in production when timestamp comparisons matter with edge-case or boundary values. 🔍 Suggested fix- R"(timestamp < timestamp("1756003005000000", "\L"))");
+ R"(timestamp < timestamp("1756003005000", "\L"))");Also apply to line 659 if using similar inconsistent 16-digit values (verify test5 uses only 13-digit format). 🤖 Prompt for AI Agents |
||
| auto plan = PlanBuilder(pool_.get()) | ||
| .startTableScan() | ||
| .outputType(ROW({"timestamp"}, {TIMESTAMP()})) | ||
|
|
@@ -655,11 +655,9 @@ TEST_F(ClpConnectorTest, test5FloatTimestampNoPushdown) { | |
|
|
||
| TEST_F(ClpConnectorTest, test5FloatTimestampPushdown) { | ||
| // Test filtering rows with a timestamp parsed from a date string and floats | ||
| // in various formats. Because KQL doesn’t automatically interpret the unit of | ||
| // the timestamp, the returned result differs slightly from the one without | ||
| // pushdown. | ||
| // in various formats. | ||
| const std::shared_ptr<std::string> kqlQuery = std::make_shared<std::string>( | ||
| "(timestamp < 1746003005.127 and timestamp >= 1746003005.124)"); | ||
| R"(timestamp < timestamp("1746003070000", "\L") and timestamp >= timestamp("1746003005124", "\L")"); | ||
| auto plan = | ||
| PlanBuilder(pool_.get()) | ||
| .startTableScan() | ||
|
|
@@ -688,12 +686,66 @@ TEST_F(ClpConnectorTest, test5FloatTimestampPushdown) { | |
| {Timestamp(1746003005, 124000000), | ||
| Timestamp(1746003005, 124100000), | ||
| Timestamp(1746003005, 125000000), | ||
| Timestamp(1746003005, 126000000)}), | ||
| Timestamp(1746003005, 126000000), | ||
| Timestamp(1746003005, 127000000), | ||
| Timestamp(1746003060, 0), | ||
| Timestamp(1746003065, 0)}), | ||
| makeFlatVector<double>( | ||
| {1.234567891234500E9, | ||
| {1.2345678912345E9, | ||
| 1E16, | ||
| 1.234567891234567E9, | ||
| 1.234567891234567E9})}); | ||
| 1.234567891234567E9, | ||
| -1.234567891234567E-9, | ||
| 1234567891.234567, | ||
| -1234567891.234567})}); | ||
| test::assertEqualVectors(expected, output); | ||
| } | ||
|
|
||
| TEST_F(ClpConnectorTest, test5v050FloatTimestampPushdown) { | ||
| // Test filtering rows with a timestamp parsed from a date string and floats | ||
| // in various formats. | ||
| const std::shared_ptr<std::string> kqlQuery = std::make_shared<std::string>( | ||
| R"(timestamp < timestamp("1746003070000", "\L") and timestamp >= timestamp("1746003005124", "\L")"); | ||
| auto plan = | ||
| PlanBuilder(pool_.get()) | ||
| .startTableScan() | ||
| .outputType(ROW({"timestamp", "floatValue"}, {TIMESTAMP(), DOUBLE()})) | ||
| .tableHandle( | ||
| std::make_shared<ClpTableHandle>(kClpConnectorId, "test_5")) | ||
| .assignments( | ||
| {{"timestamp", | ||
| std::make_shared<ClpColumnHandle>( | ||
| "timestamp", "timestamp", TIMESTAMP())}, | ||
| {"floatValue", | ||
| std::make_shared<ClpColumnHandle>( | ||
| "floatValue", "floatValue", DOUBLE())}}) | ||
| .endTableScan() | ||
| .orderBy({"\"timestamp\" ASC"}, false) | ||
| .planNode(); | ||
|
|
||
| auto output = getResults( | ||
| plan, | ||
| {makeClpSplit( | ||
| getExampleFilePath("test_5.v0.5.0.clps"), | ||
| ClpConnectorSplit::SplitType::kArchive, | ||
| kqlQuery)}); | ||
| auto expected = makeRowVector({// timestamp | ||
| makeFlatVector<Timestamp>( | ||
| {Timestamp(1746003005, 124000000), | ||
| Timestamp(1746003005, 124100000), | ||
| Timestamp(1746003005, 125000000), | ||
| Timestamp(1746003005, 126000000), | ||
| Timestamp(1746003005, 127000000), | ||
| Timestamp(1746003060, 0), | ||
| Timestamp(1746003065, 0)}), | ||
| makeFlatVector<double>( | ||
| {1.2345678912345E9, | ||
| 1E16, | ||
| 1.234567891234567E9, | ||
| 1.234567891234567E9, | ||
| -1.234567891234567E-9, | ||
| 1234567891.234567, | ||
| -1234567891.234567})}); | ||
| test::assertEqualVectors(expected, output); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial
🧩 Analysis chain
🏁 Script executed:
Repository: y-scope/velox
Length of output: 159
🏁 Script executed:
# Read the clp.cmake file to see full context cat -n CMake/resolve_dependency_modules/clp.cmakeRepository: y-scope/velox
Length of output: 1729
🏁 Script executed:
Repository: y-scope/velox
Length of output: 235
🏁 Script executed:
Repository: y-scope/velox
Length of output: 5337
🏁 Script executed:
Repository: y-scope/velox
Length of output: 17902
Add a comment documenting when this CLP commit pin can be replaced with a tagged release.
The CLP dependency is correctly pinned to commit
f82e6114160a6addd4727259906bcf621ac9912cbecause the required APIs (SetTimestampLiteralPrecision,TimestampLiteral,SingleFileArchiveDefs,cNewTimestampFormatVersion,DeprecatedDateString) are used throughout the codebase but not yet available in a tagged release. To improve maintainability and ensure this pin isn't forgotten, add a comment in the CMake file noting the target CLP release version for this upgrade.🤖 Prompt for AI Agents