feat(mep): support time bucketing in queries #2937

Merged 5 commits on Jul 14, 2022
15 changes: 14 additions & 1 deletion snuba/datasets/entities/generic_metrics.py
@@ -30,6 +30,12 @@
)
from snuba.pipeline.simple_pipeline import SimplePipelineBuilder
from snuba.query.processors import QueryProcessor
from snuba.query.processors.granularity_processor import (
DEFAULT_MAPPED_GRANULARITY_ENUM,
PERFORMANCE_GRANULARITIES,
MappedGranularityProcessor,
)
from snuba.query.processors.timeseries_processor import TimeSeriesProcessor
from snuba.query.validation.validators import (
EntityRequiredColumnValidator,
QueryValidator,
@@ -97,7 +103,14 @@ def __init__(
)

def get_query_processors(self) -> Sequence[QueryProcessor]:
return [TagsTypeTransformer()]
return [
TagsTypeTransformer(),
MappedGranularityProcessor(
accepted_granularities=PERFORMANCE_GRANULARITIES,
default_granularity_enum=DEFAULT_MAPPED_GRANULARITY_ENUM,
),
TimeSeriesProcessor({"bucketed_time": "timestamp"}, ("timestamp",)),
Member: Do you need this processor if you have the granularity processor above?

Contributor Author: Yes, because the product wants to round timestamps to odd intervals like 15m or 3h, which isn't handled by the granularity processor alone.
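To make the division of labor concrete, here is a minimal sketch (not part of this PR; bucket_by_15_minutes is a hypothetical helper name) of how the two processors would combine for a 15-minute request:

# Sketch only: shows why both processors are registered for generic metrics.
from snuba.query.processors.granularity_processor import (
    DEFAULT_MAPPED_GRANULARITY_ENUM,
    PERFORMANCE_GRANULARITIES,
    MappedGranularityProcessor,
)
from snuba.query.processors.timeseries_processor import TimeSeriesProcessor
from snuba.query.query_settings import HTTPQuerySettings


def bucket_by_15_minutes(query):  # hypothetical helper, not in this PR
    settings = HTTPQuerySettings()
    # For GRANULARITY 900 (15m), 900 % 60 == 0, so this adds the condition
    # granularity = 1 (the 60s storage enum) instead of rejecting the query.
    MappedGranularityProcessor(
        accepted_granularities=PERFORMANCE_GRANULARITIES,
        default_granularity_enum=DEFAULT_MAPPED_GRANULARITY_ENUM,
    ).process_query(query, settings)
    # Separately, this rewrites references to bucketed_time into a rounding of
    # timestamp at the raw 900s interval, which the enum mapping alone cannot
    # express.
    TimeSeriesProcessor({"bucketed_time": "timestamp"}, ("timestamp",)).process_query(
        query, settings
    )

The enum mapping chooses which pre-aggregated granularity to read from, while the time-series rewrite controls how finely the results are bucketed.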

]


class GenericMetricsSetsEntity(GenericMetricsEntity):
60 changes: 60 additions & 0 deletions snuba/query/processors/granularity_processor.py
@@ -1,3 +1,5 @@
from typing import NamedTuple, Sequence

from snuba.datasets.metrics import DEFAULT_GRANULARITY
from snuba.query.conditions import ConditionFunctions, binary_condition
from snuba.query.exceptions import InvalidGranularityException
@@ -39,3 +41,61 @@ def process_query(self, query: Query, query_settings: QuerySettings) -> None:
Literal(None, granularity),
)
)


class GranularityMapping(NamedTuple):
raw: int
enum_value: int


PERFORMANCE_GRANULARITIES: Sequence[GranularityMapping] = [
GranularityMapping(60, 1),
GranularityMapping(3600, 2),
GranularityMapping(86400, 3),
]
DEFAULT_MAPPED_GRANULARITY_ENUM = 1


class MappedGranularityProcessor(QueryProcessor):
"""
Use the granularity set on the query to filter on the granularity column,
supporting generic-metrics style enum mapping
"""

def __init__(
self,
accepted_granularities: Sequence[GranularityMapping],
default_granularity_enum: int,
):
self._accepted_granularities = sorted(
accepted_granularities, key=lambda mapping: mapping.raw, reverse=True
)
self._available_granularities_values = [
mapping.raw for mapping in self._accepted_granularities
]
self._default_granularity_enum = default_granularity_enum

def __get_granularity(self, query: Query) -> int:
"""Find the best fitting granularity for this query"""
requested_granularity = query.get_granularity()

if requested_granularity is None:
return self._default_granularity_enum
elif requested_granularity > 0:
for mapping in self._accepted_granularities:
if requested_granularity % mapping.raw == 0:
return mapping.enum_value

raise InvalidGranularityException(
f"Granularity must be multiple of one of {self._available_granularities_values}"
)

def process_query(self, query: Query, query_settings: QuerySettings) -> None:
granularity = self.__get_granularity(query)
query.add_condition_to_ast(
Member: Is it just a matter of adding the condition to the query? Is there no existing granularity condition to be removed or replaced?

Contributor Author: Yes, that's correct.

Member: Why is that, though? What was the mechanism by which the granularity was being set on the query previously, and wouldn't that still be happening?

Contributor Author: It wasn't being set previously on generic_metrics unless you added a WHERE clause with the granularity column.

binary_condition(
ConditionFunctions.EQ,
Column(None, None, "granularity"),
Literal(None, granularity),
)
)
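For readers skimming the new processor, here is a small self-contained sketch of the selection order (mappings are checked coarsest-first, and the first raw value that evenly divides the request wins); pick_enum is a hypothetical stand-in for the private __get_granularity method:

# Sketch only: mirrors MappedGranularityProcessor.__get_granularity for
# requested granularities greater than zero.
from snuba.query.processors.granularity_processor import PERFORMANCE_GRANULARITIES


def pick_enum(requested: int) -> int:  # hypothetical helper
    for mapping in sorted(
        PERFORMANCE_GRANULARITIES, key=lambda m: m.raw, reverse=True
    ):
        if requested % mapping.raw == 0:
            return mapping.enum_value
    raise ValueError("granularity must be a multiple of 60, 3600, or 86400")


assert pick_enum(24 * 60 * 60) == 3  # a full day reads the daily buckets
assert pick_enum(2 * 60 * 60) == 2   # two hours read the hourly buckets
assert pick_enum(90 * 60) == 1       # 90 minutes falls back to the 60s buckets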
75 changes: 74 additions & 1 deletion tests/query/processors/test_granularity_processor.py
@@ -15,7 +15,12 @@
from snuba.query.exceptions import InvalidGranularityException
from snuba.query.expressions import Column, Literal
from snuba.query.logical import Query
from snuba.query.processors.granularity_processor import GranularityProcessor
from snuba.query.processors.granularity_processor import (
DEFAULT_MAPPED_GRANULARITY_ENUM,
PERFORMANCE_GRANULARITIES,
GranularityProcessor,
MappedGranularityProcessor,
)
from snuba.query.query_settings import HTTPQuerySettings


@@ -83,3 +88,71 @@ def test_granularity_added(
),
granularity=(requested_granularity),
)


@pytest.mark.parametrize(
"entity_key,column",
[
(EntityKey.GENERIC_METRICS_DISTRIBUTIONS, "percentiles"),
(EntityKey.GENERIC_METRICS_SETS, "value"),
],
)
@pytest.mark.parametrize(
"requested_granularity, query_granularity",
[
(None, 1),
(10, None),
(60, 1),
(90, None),
(120, 1),
(60 * 60, 2),
(90 * 60, 1),
(120 * 60, 2),
(24 * 60 * 60, 3),
(32 * 60 * 60, 2),
(48 * 60 * 60, 3),
(13, None),
(0, None),
],
)
def test_granularity_enum_mapping(
entity_key: EntityKey,
column: str,
requested_granularity: Optional[int],
query_granularity: int,
) -> None:
query = Query(
QueryEntity(entity_key, ColumnSet([])),
selected_columns=[SelectedExpression(column, Column(None, None, column))],
condition=binary_condition(
ConditionFunctions.EQ, Column(None, None, "metric_id"), Literal(None, 123)
),
granularity=(requested_granularity),
)

try:
MappedGranularityProcessor(
accepted_granularities=PERFORMANCE_GRANULARITIES,
default_granularity_enum=DEFAULT_MAPPED_GRANULARITY_ENUM,
).process_query(query, HTTPQuerySettings())
except InvalidGranularityException:
assert query_granularity is None
else:
assert query == Query(
QueryEntity(entity_key, ColumnSet([])),
selected_columns=[SelectedExpression(column, Column(None, None, column))],
condition=binary_condition(
BooleanFunctions.AND,
binary_condition(
ConditionFunctions.EQ,
Column(None, None, "granularity"),
Literal(None, query_granularity),
),
binary_condition(
ConditionFunctions.EQ,
Column(None, None, "metric_id"),
Literal(None, 123),
),
),
granularity=(requested_granularity),
)
40 changes: 35 additions & 5 deletions tests/test_generic_metrics_api.py
@@ -143,7 +143,7 @@ def test_retrieval_basic(self) -> None:
AND metric_id = {self.metric_id}
AND timestamp >= toDateTime('{self.start_time}')
AND timestamp < toDateTime('{self.end_time}')
GRANULARITY 1
GRANULARITY 60
"""
response = self.app.post(
SNQL_ROUTE,
@@ -180,7 +180,7 @@ def test_raw_tags(self) -> None:
AND tags_raw[{tag_key}] = '{value_as_string}'
AND timestamp >= toDateTime('{self.start_time}')
AND timestamp < toDateTime('{self.end_time}')
GRANULARITY 1
GRANULARITY 60
Member: I don't think I follow why these queries changed. Are you saying they were wrong before?

Contributor Author: I'm going to remove that second condition, so now they will be wrong, but I just wanted to verify that we're handling the "raw" granularities correctly.

Member: Which one are you saying is the correct value, and which one is the wrong one? And if you're saying it will be wrong after this PR, can you give some info about why, and how it will be addressed in the future?

Contributor Author: When I originally put up the PR, I was thinking that providing either raw or enum values was correct. I've reconsidered, and I now think we should accept only the "raw" (in seconds) granularity values to reduce confusion.

Our ideal is to take this non-infrastructure concern and move it to the product side (e.g. Sentry should maintain the list of enum values for the granularities it cares about and use them directly in queries). They don't currently have the flexibility in their query layer, though, to support two different approaches to building requests.

How this will be addressed in the future is undetermined, but they're aware that we don't like having this logic on our side.
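As a hedged illustration of what accepting only the raw values means in practice (not part of this PR): the raw seconds are the public contract, and the enum values remain an internal detail derived from them.

# Sketch only: the raw-to-enum mapping that MappedGranularityProcessor derives.
from snuba.query.processors.granularity_processor import PERFORMANCE_GRANULARITIES

raw_to_enum = {m.raw: m.enum_value for m in PERFORMANCE_GRANULARITIES}
assert raw_to_enum == {60: 1, 3600: 2, 86400: 3}

# An old-style "GRANULARITY 1" is not in the raw domain (1 is not a multiple of
# 60, 3600, or 86400), so the processor raises InvalidGranularityException
# rather than silently treating it as the 60s enum.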

"""
response = self.app.post(
SNQL_ROUTE,
@@ -217,7 +217,7 @@ def test_indexed_tags(self) -> None:
AND tags[{tag_key}] = {tag_idx_value}
AND timestamp >= toDateTime('{self.start_time}')
AND timestamp < toDateTime('{self.end_time}')
GRANULARITY 1
GRANULARITY 60
"""
response = self.app.post(
SNQL_ROUTE,
@@ -268,6 +268,9 @@ def setup_method(self, test_method: Any) -> None:
self.end_time = (
self.base_time + timedelta(seconds=self.count) + timedelta(seconds=10)
)
self.hour_before_start_time = self.start_time - timedelta(hours=1)
self.hour_after_start_time = self.start_time + timedelta(hours=1)

self.generate_dists()

def generate_dists(self) -> None:
@@ -308,7 +311,7 @@ def test_retrieval_basic(self) -> None:
AND metric_id = {self.metric_id}
AND timestamp >= toDateTime('{self.start_time}')
AND timestamp < toDateTime('{self.end_time}')
GRANULARITY 1
GRANULARITY 60
"""
response = self.app.post(
SNQL_ROUTE,
@@ -333,7 +336,7 @@ def test_retrieval_percentiles(self) -> None:
AND metric_id = {self.metric_id}
AND timestamp >= toDateTime('{self.start_time}')
AND timestamp < toDateTime('{self.end_time}')
GRANULARITY 1
GRANULARITY 60
"""
response = self.app.post(
SNQL_ROUTE,
@@ -349,3 +352,30 @@
assert aggregation["org_id"] == self.org_id
assert aggregation["project_id"] == self.project_id
assert aggregation["quants"] == [2.0, approx(4.0), approx(4.0), approx(4.0)]

def test_arbitrary_granularity(self) -> None:
query_str = f"""MATCH (generic_metrics_distributions)
SELECT quantiles(0.5,0.9,0.95,0.99)(value) AS quants, min(bucketed_time) AS min_time
BY project_id, org_id
WHERE org_id = {self.org_id}
AND project_id = {self.project_id}
AND metric_id = {self.metric_id}
AND timestamp >= toDateTime('{self.hour_before_start_time}')
AND timestamp < toDateTime('{self.hour_after_start_time}')
GRANULARITY 3600
"""
response = self.app.post(
SNQL_ROUTE,
data=json.dumps({"query": query_str, "dataset": "generic_metrics"}),
)
data = json.loads(response.data)

assert response.status_code == 200
assert len(data["data"]) == 1, data

aggregation = data["data"][0]
smallest_time_bucket = datetime.strptime(
aggregation["min_time"], "%Y-%m-%dT%H:%M:%S+00:00"
)
assert smallest_time_bucket.hour == 12
assert smallest_time_bucket.minute == 0