-
Notifications
You must be signed in to change notification settings - Fork 45
feat[healthcheck]: new health endpoint #807
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
f3fc379
feat: new /health endpoint
luislhl e4e0540
fix: api docs
luislhl fccf60c
feat: create a 'strict_status_code' query param
luislhl 17c09de
chore: move the healthcheck tests to the new endpoint
luislhl 4e4b604
chore: test for the 'strict_status_code' parm
luislhl 551559e
fix: remove old todo
luislhl 7962d44
style: use single quotation marks in the code
luislhl File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| from abc import ABC, abstractmethod | ||
| from dataclasses import dataclass | ||
| from datetime import datetime | ||
| from enum import Enum | ||
| from typing import Any, Optional | ||
|
|
||
|
|
||
class ComponentType(str, Enum):
    """Kinds of component that a ``ComponentHealthCheck`` can describe.

    Every member is also a ``str`` equal to its lowercase value, which is the
    text that ends up in the serialized health check.
    """

    DATASTORE = 'datastore'
    INTERNAL = 'internal'
    FULLNODE = 'fullnode'
|
|
||
|
|
||
class HealthCheckStatus(str, Enum):
    """Possible outcomes of a health check, for a component or for the service.

    Every member is also a ``str`` equal to its lowercase value, which is the
    text that ends up in the serialized health check.
    """

    # Listed from healthiest to least healthy.
    PASS = 'pass'
    WARN = 'warn'
    FAIL = 'fail'
|
|
||
|
|
||
@dataclass
class ComponentHealthCheck:
    """Result of one health check run against a single component.

    ``time`` is filled in with the current UTC time when the caller does not
    provide a timestamp, so every result carries one.
    """

    component_name: str
    component_type: ComponentType
    status: HealthCheckStatus
    output: str
    time: Optional[str] = None
    component_id: Optional[str] = None
    observed_value: Optional[str] = None
    observed_unit: Optional[str] = None

    def __post_init__(self) -> None:
        """Stamp the result with the current UTC time unless one was given."""
        # Imported locally because the module header only imports ``datetime``.
        from datetime import timezone

        # Keep a caller-supplied timestamp. Overwriting it unconditionally would
        # silently discard the ``time`` argument accepted by the constructor.
        if not self.time:
            # ``datetime.utcnow()`` is deprecated since Python 3.12; an aware
            # "now" in UTC formats to exactly the same string.
            self.time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    def to_json(self) -> dict[str, str]:
        """Return a dict representation of this result, using camelCase keys.

        ``componentType``, ``status`` and ``output`` are always present.
        ``time`` and ``componentId`` are added when non-empty, and
        ``observedValue`` together with ``observedUnit`` when an observed value
        is set. The component name is not part of the returned dict.

        Raises:
            AssertionError: if ``observed_value`` is set but ``observed_unit``
                is ``None``.
        """
        payload = {
            'componentType': self.component_type.value,
            'status': self.status.value,
            'output': self.output,
        }

        if self.time:
            payload['time'] = self.time

        if self.component_id:
            payload['componentId'] = self.component_id

        if self.observed_value:
            # An observed value is meaningless without its unit.
            assert (
                self.observed_unit is not None
            ), 'observed_unit must be set if observed_value is set'

            payload['observedValue'] = self.observed_value
            payload['observedUnit'] = self.observed_unit

        return payload
|
|
||
|
|
||
@dataclass
class ServiceHealthCheck:
    """Aggregated health check result for the whole service.

    ``checks`` maps a key to the list of component results stored under it.
    The overall status is derived from those results each time it is read.
    """

    description: str
    checks: dict[str, list[ComponentHealthCheck]]

    def __post_init__(self) -> None:
        """Validate the freshly initialized object.

        Raises:
            ValueError: if ``checks`` is empty.
        """
        if self.checks:
            return
        raise ValueError('checks dict cannot be empty')

    @property
    def status(self) -> HealthCheckStatus:
        """Return the worst status found among all component results.

        Any ``FAIL`` makes the service ``FAIL``; otherwise any ``WARN`` makes it
        ``WARN``; otherwise it is ``PASS``.
        """
        saw_warning = False
        every_result = (result for results in self.checks.values() for result in results)

        for result in every_result:
            # A single failure decides the outcome, no need to look further.
            if result.status == HealthCheckStatus.FAIL:
                return HealthCheckStatus.FAIL
            if result.status == HealthCheckStatus.WARN:
                saw_warning = True

        return HealthCheckStatus.WARN if saw_warning else HealthCheckStatus.PASS

    def get_http_status_code(self) -> int:
        """Return the HTTP status code matching the current status.

        Raises:
            ValueError: if the status has no HTTP code assigned.
        """
        current = self.status
        code_by_status = {
            HealthCheckStatus.PASS: 200,
            HealthCheckStatus.WARN: 503,
            HealthCheckStatus.FAIL: 503,
        }

        code = code_by_status.get(current)
        if code is None:
            raise ValueError(f'Missing treatment for status {current}')
        return code

    def to_json(self) -> dict[str, Any]:
        """Return a dict representation of the object, using camelCase keys."""
        serialized_checks = {}
        for key, results in self.checks.items():
            serialized_checks[key] = [result.to_json() for result in results]

        return {
            'status': self.status.value,
            'description': self.description,
            'checks': serialized_checks,
        }
|
|
||
|
|
||
class ComponentHealthCheckInterface(ABC):
    """Contract for classes that report the health of a single component."""

    @abstractmethod
    async def get_health_check(self) -> ComponentHealthCheck:
        """Produce the current health check result for the component.

        Subclasses must override this coroutine; the base implementation only
        raises ``NotImplementedError``.
        """
        raise NotImplementedError
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| # Copyright 2021 Hathor Labs | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from hathor.healthcheck.resources.healthcheck import HealthcheckResource | ||
|
|
||
# Names exported by this module on a star-import.
__all__ = [
    'HealthcheckResource',
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,199 @@ | ||
| import hathor | ||
| from hathor.api_util import Resource, get_arg_default, get_args | ||
| from hathor.cli.openapi_files.register import register_resource | ||
| from hathor.healthcheck.models import ComponentHealthCheck, ComponentType, HealthCheckStatus, ServiceHealthCheck | ||
| from hathor.manager import HathorManager | ||
| from hathor.util import json_dumpb | ||
|
|
||
|
|
||
def build_sync_health_status(manager: HathorManager) -> ComponentHealthCheck:
    """Turn the manager's sync health into a ``ComponentHealthCheck``.

    The check is named ``'sync'`` and typed as an internal component. It passes
    when ``manager.is_sync_healthy()`` reports healthy and fails otherwise. The
    reported reason becomes the output, or ``'Healthy'`` when there is none.
    """
    is_healthy, unhealthy_reason = manager.is_sync_healthy()

    if is_healthy:
        sync_status = HealthCheckStatus.PASS
    else:
        sync_status = HealthCheckStatus.FAIL

    message = unhealthy_reason if unhealthy_reason else 'Healthy'

    return ComponentHealthCheck(
        component_name='sync',
        component_type=ComponentType.INTERNAL,
        status=sync_status,
        output=message,
    )
|
|
||
|
|
||
@register_resource
class HealthcheckResource(Resource):
    """HTTP resource that reports the health status of the fullnode."""

    isLeaf = True

    def __init__(self, manager: HathorManager):
        # The manager is the source of the sync health information.
        self.manager = manager

    def render_GET(self, request):
        """ GET request /health/
        Returns the health status of the fullnode

        By default the response code reflects the health status. Passing
        'strict_status_code=1' forces the response code to 200 even when the
        fullnode is unhealthy, which helps when integrating with tools that
        only forward the response body if the response code is 200.

        :rtype: string (json)
        """
        query_args = get_args(request)
        always_ok = get_arg_default(query_args, 'strict_status_code', '0') == '1'

        component_checks = [
            build_sync_health_status(self.manager),
        ]

        # Each component result is stored under its own name.
        checks_by_name = {check.component_name: [check] for check in component_checks}
        service_health = ServiceHealthCheck(
            description=f'Hathor-core {hathor.__version__}',
            checks=checks_by_name,
        )

        response_code = 200 if always_ok else service_health.get_http_status_code()
        request.setResponseCode(response_code)

        return json_dumpb(service_health.to_json())
|
|
||
|
|
||
# OpenAPI description of the /health endpoint.
#
# The example payloads mirror what ComponentHealthCheck.to_json() emits for the
# sync check: 'componentType', 'status', 'output' and 'time'. The component name
# is not a field of a check object, it only appears as the key in 'checks'.
HealthcheckResource.openapi = {
    '/health': {
        'x-visibility': 'public',
        'x-rate-limit': {
            'global': [
                {
                    'rate': '10r/s',
                    'burst': 10,
                    'delay': 5
                }
            ],
            'per-ip': [
                {
                    'rate': '1r/s',
                    'burst': 3,
                    'delay': 2
                }
            ]
        },
        'get': {
            'tags': ['healthcheck'],
            'operationId': 'get',
            'summary': 'Health status of the fullnode',
            'description': '''
Returns 200 if the fullnode should be considered healthy.

Returns 503 otherwise. The response will contain the components that were considered for the healthcheck
and the reason why they were unhealthy.

Returning 503 with a response body is not the standard behavior for our API, but it was chosen because
most healthcheck tools expect a 503 response code to indicate that the service is unhealthy.

Optionally, there is a query parameter 'strict_status_code' that can be used to return 200 even if the fullnode
is unhealthy. When its value is 1, the response will always be 200.

We currently perform 2 checks in the sync mechanism for the healthcheck:
1. Whether the fullnode has recent block activity, i.e. if the fullnode has blocks with recent timestamps.
2. Whether the fullnode has at least one synced peer
''',
            'parameters': [
                {
                    'name': 'strict_status_code',
                    'in': 'query',
                    'description': 'Enables strict status code. If set to 1, the response will always be 200.',
                    'required': False,
                    'schema': {
                        'type': 'string'
                    }
                },
            ],
            'responses': {
                '200': {
                    'description': 'Healthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'healthy': {
                                    'summary': 'Healthy node',
                                    'value': {
                                        'status': 'pass',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'pass',
                                                    'output': 'Healthy',
                                                    'time': '2023-10-04T17:09:35Z'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
                '503': {
                    'description': 'Unhealthy',
                    'content': {
                        'application/json': {
                            'examples': {
                                'no_recent_activity': {
                                    'summary': 'Node with no recent activity',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have recent blocks',
                                                    'time': '2023-10-04T17:09:35Z'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'no_synced_peer': {
                                    'summary': 'Node with no synced peer',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node doesn\'t have a synced peer',
                                                    'time': '2023-10-04T17:09:35Z'
                                                }
                                            ]
                                        }
                                    }
                                },
                                'peer_best_block_far_ahead': {
                                    'summary': 'Peer with best block too far ahead',
                                    'value': {
                                        'status': 'fail',
                                        'description': 'Hathor-core v0.56.0',
                                        'checks': {
                                            'sync': [
                                                {
                                                    'componentType': 'internal',
                                                    'status': 'fail',
                                                    'output': 'Node\'s peer with highest height is too far ahead.',
                                                    'time': '2023-10-04T17:09:35Z'
                                                }
                                            ]
                                        }
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.