feat(mep): support time bucketing in queries #2937
Changes from all commits: a2cf9e9, 8f25780, ff9551e, b1a5528, 1b8f5ef
The first file adds the `MappedGranularityProcessor`:

```diff
@@ -1,3 +1,5 @@
+from typing import NamedTuple, Sequence
+
 from snuba.datasets.metrics import DEFAULT_GRANULARITY
 from snuba.query.conditions import ConditionFunctions, binary_condition
 from snuba.query.exceptions import InvalidGranularityException
@@ -39,3 +41,61 @@ def process_query(self, query: Query, query_settings: QuerySettings) -> None:
                 Literal(None, granularity),
             )
         )
+
+
+class GranularityMapping(NamedTuple):
+    raw: int
+    enum_value: int
+
+
+PERFORMANCE_GRANULARITIES: Sequence[GranularityMapping] = [
+    GranularityMapping(60, 1),
+    GranularityMapping(3600, 2),
+    GranularityMapping(86400, 3),
+]
+DEFAULT_MAPPED_GRANULARITY_ENUM = 1
+
+
+class MappedGranularityProcessor(QueryProcessor):
+    """
+    Use the granularity set on the query to filter on the granularity column,
+    supporting generic-metrics style enum mapping
+    """
+
+    def __init__(
+        self,
+        accepted_granularities: Sequence[GranularityMapping],
+        default_granularity_enum: int,
+    ):
+        self._accepted_granularities = sorted(
+            accepted_granularities, key=lambda mapping: mapping.raw, reverse=True
+        )
+        self._available_granularities_values = [
+            mapping.raw for mapping in self._accepted_granularities
+        ]
+        self._default_granularity_enum = default_granularity_enum
+
+    def __get_granularity(self, query: Query) -> int:
+        """Find the best fitting granularity for this query"""
+        requested_granularity = query.get_granularity()
+
+        if requested_granularity is None:
+            return self._default_granularity_enum
+        elif requested_granularity > 0:
+            for mapping in self._accepted_granularities:
+                if requested_granularity % mapping.raw == 0:
+                    return mapping.enum_value
+
+        raise InvalidGranularityException(
+            f"Granularity must be multiple of one of {self._available_granularities_values}"
+        )
+
+    def process_query(self, query: Query, query_settings: QuerySettings) -> None:
+        granularity = self.__get_granularity(query)
+        query.add_condition_to_ast(
+            binary_condition(
+                ConditionFunctions.EQ,
+                Column(None, None, "granularity"),
+                Literal(None, granularity),
+            )
+        )
```

Review thread on the `add_condition_to_ast` call:

**Reviewer:** Is it just a matter of adding the condition to the query? Is there no existing granularity condition to be removed or replaced?

**Author:** Yes, that's correct.

**Reviewer:** Why is that, though? What was the mechanism by which the granularity was being set on the query previously, and wouldn't that still be happening?

**Author:** It wasn't being set previously on generic_metrics unless you added a …
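To make the mapping concrete, here is a minimal standalone sketch of the same selection logic (the helper name `pick_enum_value` is illustrative, not part of the PR). Because the mappings are sorted coarsest-first, a requested granularity that is a multiple of several stored rollups resolves to the coarsest one — e.g. 7200 seconds matches the hourly bucket (enum 2), not the minutely one:

```python
from typing import NamedTuple, Optional, Sequence


class GranularityMapping(NamedTuple):
    raw: int         # granularity in seconds, as written in the query
    enum_value: int  # value stored in the table's granularity column


PERFORMANCE_GRANULARITIES = [
    GranularityMapping(60, 1),
    GranularityMapping(3600, 2),
    GranularityMapping(86400, 3),
]


def pick_enum_value(
    requested: Optional[int],
    mappings: Sequence[GranularityMapping],
    default_enum: int = 1,
) -> int:
    """Illustrative mirror of MappedGranularityProcessor.__get_granularity."""
    if requested is None:
        return default_enum
    if requested > 0:
        # Coarsest first, so 7200 resolves to the hourly rollup, not the minutely one.
        for mapping in sorted(mappings, key=lambda m: m.raw, reverse=True):
            if requested % mapping.raw == 0:
                return mapping.enum_value
    raise ValueError(
        f"Granularity must be a multiple of one of {[m.raw for m in mappings]}"
    )


assert pick_enum_value(60, PERFORMANCE_GRANULARITIES) == 1
assert pick_enum_value(7200, PERFORMANCE_GRANULARITIES) == 2   # 2h -> hourly buckets
assert pick_enum_value(86400, PERFORMANCE_GRANULARITIES) == 3
assert pick_enum_value(None, PERFORMANCE_GRANULARITIES) == 1   # default
```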
The second file in the diff updates the generic-metrics SnQL tests:
```diff
@@ -143,7 +143,7 @@ def test_retrieval_basic(self) -> None:
                     AND metric_id = {self.metric_id}
                     AND timestamp >= toDateTime('{self.start_time}')
                     AND timestamp < toDateTime('{self.end_time}')
-                    GRANULARITY 1
+                    GRANULARITY 60
                     """
         response = self.app.post(
             SNQL_ROUTE,
```
```diff
@@ -180,7 +180,7 @@ def test_raw_tags(self) -> None:
                     AND tags_raw[{tag_key}] = '{value_as_string}'
                     AND timestamp >= toDateTime('{self.start_time}')
                     AND timestamp < toDateTime('{self.end_time}')
-                    GRANULARITY 1
+                    GRANULARITY 60
                     """
         response = self.app.post(
             SNQL_ROUTE,
```

Review thread on the `GRANULARITY` changes:

**Reviewer:** I don't think I follow why these queries changed. Are you saying they were wrong before?

**Author:** I'm going to remove that second condition, so now they will be wrong, but I just wanted to verify that we're handling the "raw" granularities correctly.

**Reviewer:** Which one are you saying is the correct value and which one is the wrong one? And if you're saying it will be wrong after this PR, can you give some info about why, and how it will be addressed in the future?

**Author:** When I originally put up the PR, I was thinking that providing both raw and enum values was correct. I've reconsidered, and I now think we should accept only the "raw" (in seconds) granularity value, to reduce confusion. Our ideal is to take this non-infrastructure concern and move it to the product side (e.g. Sentry should maintain the list of enum values for the granularities it cares about and use them directly in queries). They don't have the flexibility in their query layer at the moment, though, to have two different approaches for building requests. How it will be addressed in the future is undetermined, but they're aware that we don't like having this logic on our side.
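To see why the tests switch from `GRANULARITY 1` to `GRANULARITY 60`: once only raw seconds are accepted, the old enum spelling `1` is rejected, because 1 is not a multiple of any stored rollup. A standalone sketch of that validity check, mirroring the processor's modulo test (helper name illustrative):

```python
def is_valid_raw_granularity(requested: int, rollups=(60, 3600, 86400)) -> bool:
    """True if the requested raw granularity lands on one of the stored rollups."""
    return requested > 0 and any(requested % rollup == 0 for rollup in rollups)


assert is_valid_raw_granularity(60)      # raw seconds: accepted, maps to enum 1
assert not is_valid_raw_granularity(1)   # old enum spelling: now rejected
```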
```diff
@@ -217,7 +217,7 @@ def test_indexed_tags(self) -> None:
                     AND tags[{tag_key}] = {tag_idx_value}
                     AND timestamp >= toDateTime('{self.start_time}')
                     AND timestamp < toDateTime('{self.end_time}')
-                    GRANULARITY 1
+                    GRANULARITY 60
                     """
         response = self.app.post(
             SNQL_ROUTE,
```
@@ -268,6 +268,9 @@ def setup_method(self, test_method: Any) -> None: | |
self.end_time = ( | ||
self.base_time + timedelta(seconds=self.count) + timedelta(seconds=10) | ||
) | ||
self.hour_before_start_time = self.start_time - timedelta(hours=1) | ||
self.hour_after_start_time = self.start_time + timedelta(hours=1) | ||
|
||
self.generate_dists() | ||
|
||
def generate_dists(self) -> None: | ||
|
```diff
@@ -308,7 +311,7 @@ def test_retrieval_basic(self) -> None:
                     AND metric_id = {self.metric_id}
                     AND timestamp >= toDateTime('{self.start_time}')
                     AND timestamp < toDateTime('{self.end_time}')
-                    GRANULARITY 1
+                    GRANULARITY 60
                     """
         response = self.app.post(
             SNQL_ROUTE,
```
```diff
@@ -333,7 +336,7 @@ def test_retrieval_percentiles(self) -> None:
                     AND metric_id = {self.metric_id}
                     AND timestamp >= toDateTime('{self.start_time}')
                     AND timestamp < toDateTime('{self.end_time}')
-                    GRANULARITY 1
+                    GRANULARITY 60
                     """
         response = self.app.post(
             SNQL_ROUTE,
```
```diff
@@ -349,3 +352,30 @@ def test_retrieval_percentiles(self) -> None:
         assert aggregation["org_id"] == self.org_id
         assert aggregation["project_id"] == self.project_id
         assert aggregation["quants"] == [2.0, approx(4.0), approx(4.0), approx(4.0)]
+
+    def test_arbitrary_granularity(self) -> None:
+        query_str = f"""MATCH (generic_metrics_distributions)
+                    SELECT quantiles(0.5,0.9,0.95,0.99)(value) AS quants, min(bucketed_time) AS min_time
+                    BY project_id, org_id
+                    WHERE org_id = {self.org_id}
+                    AND project_id = {self.project_id}
+                    AND metric_id = {self.metric_id}
+                    AND timestamp >= toDateTime('{self.hour_before_start_time}')
+                    AND timestamp < toDateTime('{self.hour_after_start_time}')
+                    GRANULARITY 3600
+                    """
+        response = self.app.post(
+            SNQL_ROUTE,
+            data=json.dumps({"query": query_str, "dataset": "generic_metrics"}),
+        )
+        data = json.loads(response.data)
+
+        assert response.status_code == 200
+        assert len(data["data"]) == 1, data
+
+        aggregation = data["data"][0]
+        smallest_time_bucket = datetime.strptime(
+            aggregation["min_time"], "%Y-%m-%dT%H:%M:%S+00:00"
+        )
+        assert smallest_time_bucket.hour == 12
+        assert smallest_time_bucket.minute == 0
```
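The `minute == 0` assertion holds because `bucketed_time` stores each row's timestamp floored to the start of its granularity window, so with `GRANULARITY 3600` the smallest bucket falls exactly on an hour boundary. A minimal sketch of that flooring, assuming buckets align to Unix-epoch multiples of the granularity (`floor_to_bucket` is a hypothetical helper, not part of the PR):

```python
from datetime import datetime, timezone


def floor_to_bucket(ts: datetime, granularity_seconds: int) -> datetime:
    """Round a timestamp down to the start of its granularity bucket."""
    epoch = int(ts.timestamp())
    return datetime.fromtimestamp(epoch - epoch % granularity_seconds, tz=timezone.utc)


ts = datetime(2022, 8, 1, 12, 34, 56, tzinfo=timezone.utc)
assert floor_to_bucket(ts, 3600) == datetime(2022, 8, 1, 12, 0, tzinfo=timezone.utc)
assert floor_to_bucket(ts, 60) == datetime(2022, 8, 1, 12, 34, tzinfo=timezone.utc)
```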
Review thread on the processor configuration:

**Reviewer:** Do you need this processor if you have the granularity processor above?

**Author:** Yes, because the product wants to round timestamps to odd intervals like 15m or 3h, which isn't handled by the granularity processor alone.
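A rough illustration of why both steps are needed (helper names and the generated expression are hypothetical; the real rewrite happens on the query AST): the mapped-granularity processor only picks which stored rollup to read, while a separate time-bucketing step re-rounds `bucketed_time` to whatever arbitrary interval the product asked for, such as 15 minutes on top of the 60-second rollup.

```python
def choose_rollup(interval_seconds: int, rollups=(86400, 3600, 60)) -> int:
    """Pick the coarsest stored rollup that evenly divides the requested interval."""
    for rollup in rollups:  # coarsest first
        if interval_seconds % rollup == 0:
            return rollup
    raise ValueError(f"{interval_seconds}s is not a multiple of any stored rollup")


def rebucket_expression(interval_seconds: int) -> str:
    """Hypothetical ClickHouse-style expression re-rounding bucketed_time."""
    return (
        f"toDateTime(intDiv(toUInt32(bucketed_time), {interval_seconds})"
        f" * {interval_seconds})"
    )


assert choose_rollup(900) == 60       # 15m reads the 60s rollup...
assert choose_rollup(10800) == 3600   # ...and 3h reads the hourly rollup
print(rebucket_expression(900))
```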