-
Couldn't load subscription status.
- Fork 405
Fix LaterGauge metrics to collect from all servers
#18751
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
33d6a3c
c90b5f8
35be852
2b92d11
acc5ecc
5e4a0dc
2b9c84b
193eba2
770bb81
f569e29
0a710cb
2da500f
5efae1e
11cf4e8
b4479cd
977fe9e
d967c9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Fix `LaterGauge` metrics to collect from all servers. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| Dict, | ||
| Generic, | ||
| Iterable, | ||
| List, | ||
| Mapping, | ||
| Optional, | ||
| Sequence, | ||
|
|
@@ -73,8 +74,6 @@ | |
|
|
||
| METRICS_PREFIX = "/_synapse/metrics" | ||
|
|
||
| all_gauges: Dict[str, Collector] = {} | ||
|
|
||
| HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat") | ||
|
|
||
| SERVER_NAME_LABEL = "server_name" | ||
|
|
@@ -163,42 +162,47 @@ class LaterGauge(Collector): | |
| name: str | ||
devonh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| desc: str | ||
| labelnames: Optional[StrSequence] = attr.ib(hash=False) | ||
| # callback: should either return a value (if there are no labels for this metric), | ||
| # or dict mapping from a label tuple to a value | ||
| caller: Callable[ | ||
| [], Union[Mapping[Tuple[str, ...], Union[int, float]], Union[int, float]] | ||
| ] | ||
| # List of callbacks: each callback should either return a value (if there are no | ||
| # labels for this metric), or dict mapping from a label tuple to a value | ||
| _hooks: List[ | ||
| Callable[ | ||
| [], Union[Mapping[Tuple[str, ...], Union[int, float]], Union[int, float]] | ||
| ] | ||
| ] = attr.ib(factory=list, hash=False) | ||
|
|
||
| def collect(self) -> Iterable[Metric]: | ||
| # The decision to add `SERVER_NAME_LABEL` is from the `LaterGauge` usage itself | ||
| # (we don't enforce it here, one level up). | ||
| g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) # type: ignore[missing-server-name-label] | ||
|
|
||
| try: | ||
| calls = self.caller() | ||
| except Exception: | ||
| logger.exception("Exception running callback for LaterGauge(%s)", self.name) | ||
| yield g | ||
| return | ||
| for hook in self._hooks: | ||
| try: | ||
| hook_result = hook() | ||
| except Exception: | ||
| logger.exception( | ||
| "Exception running callback for LaterGauge(%s)", self.name | ||
| ) | ||
| yield g | ||
| return | ||
|
|
||
| if isinstance(hook_result, (int, float)): | ||
| g.add_metric([], hook_result) | ||
| else: | ||
| for k, v in hook_result.items(): | ||
| g.add_metric(k, v) | ||
|
|
||
| if isinstance(calls, (int, float)): | ||
| g.add_metric([], calls) | ||
| else: | ||
| for k, v in calls.items(): | ||
| g.add_metric(k, v) | ||
| yield g | ||
|
|
||
| yield g | ||
| def register_hook( | ||
| self, | ||
| hook: Callable[ | ||
| [], Union[Mapping[Tuple[str, ...], Union[int, float]], Union[int, float]] | ||
| ], | ||
| ) -> None: | ||
| self._hooks.append(hook) | ||
|
|
||
| def __attrs_post_init__(self) -> None: | ||
| self._register() | ||
|
|
||
| def _register(self) -> None: | ||
| if self.name in all_gauges.keys(): | ||
| logger.warning("%s already registered, reregistering", self.name) | ||
| REGISTRY.unregister(all_gauges.pop(self.name)) | ||
|
|
||
| REGISTRY.register(self) | ||
| all_gauges[self.name] = self | ||
|
Comment on lines
-195
to
-201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Huh - that also solves it. @erikjohnston it looks like you last touched this "reregistering" stuff back 7 years ago. Logically, it makes sense to me that we shouldn't be reregistering the same metrics multiple times within Synapse. Anything that does must surely be a bug. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's fine. I wonder if we should explicitly error if we try and reregister the same metric name then? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on our experience after merging this PR (see #18789 where this PR was reverted), I'm guessing this might have been added because we create many many homeservers in our trial tests ( |
||
|
|
||
|
|
||
| # `MetricsEntry` only makes sense when it is a `Protocol`, | ||
|
|
@@ -250,7 +254,7 @@ def __init__( | |
| # Protects access to _registrations | ||
| self._lock = threading.Lock() | ||
|
|
||
| self._register_with_collector() | ||
| REGISTRY.register(self) | ||
|
|
||
| def register( | ||
| self, | ||
|
|
@@ -341,14 +345,6 @@ def collect(self) -> Iterable[Metric]: | |
| gauge.add_metric(labels=key, value=getattr(metrics, name)) | ||
| yield gauge | ||
|
|
||
| def _register_with_collector(self) -> None: | ||
| if self.name in all_gauges.keys(): | ||
| logger.warning("%s already registered, reregistering", self.name) | ||
| REGISTRY.unregister(all_gauges.pop(self.name)) | ||
|
|
||
| REGISTRY.register(self) | ||
| all_gauges[self.name] = self | ||
|
|
||
|
|
||
| class GaugeHistogramMetricFamilyWithLabels(GaugeHistogramMetricFamily): | ||
| """ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the changelog is already underway in https://github.com/element-hq/synapse/blob/release-v1.136/CHANGES.md, I'm going to leave this as-is as it will require manual tinkering anyway to pull into
v1.136.0and we don't need to re-run CI again.The desire is for this changelog to merge with #18714
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good to me. I'm already manually tinkering away in there