Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions custom_components/givenergy_local/config_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import voluptuous as vol
from givenergy_modbus.client.client import Client
from givenergy_modbus.exceptions import RefreshPartiallySucceeded
from homeassistant.config_entries import ConfigFlow, ConfigFlowResult
from homeassistant.const import CONF_HOST, CONF_PORT

Expand Down Expand Up @@ -94,11 +95,25 @@ async def _test_connection(self, host: str, port: int) -> tuple[str, str | None]
try:
await client.connect()
# detect() resolves the device model and topology before any reads
# so refresh_plant() picks the right register layout (single vs.
# so refresh() picks the right register layout (single vs.
# three-phase) from the first request.
await client.detect()
plant = await client.refresh_plant(full_refresh=False)
return plant.inverter_serial_number, None
try:
plant = await client.refresh()
except RefreshPartiallySucceeded as exc:
# A connectivity probe only needs to identify the inverter
# (device 0x32), which is virtually always among the successful
# reads — a partial usually means a peripheral battery/meter
# dropped. A usable snapshot is enough here; RefreshFailed (no
# data at all) falls through to "cannot_connect" below.
plant = exc.plant
serial = plant.inverter_serial_number
if not serial:
# The partial dropped the inverter read itself — no usable
# unique ID, so treat it as a failed connection rather than
# creating an entry with an empty serial.
return "", "cannot_connect"
return serial, None
except Exception:
_LOGGER.exception("Connection test failed for %s:%s", host, port)
return "", "cannot_connect"
Expand Down
151 changes: 124 additions & 27 deletions custom_components/givenergy_local/coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
from datetime import datetime, timedelta

from givenergy_modbus.client.client import Client
from givenergy_modbus.exceptions import PlantTopologyMismatch
from givenergy_modbus.exceptions import (
PlantTopologyMismatch,
ReadFailure,
RefreshFailed,
RefreshPartiallySucceeded,
)
from givenergy_modbus.model.inverter import SinglePhaseInverter
from givenergy_modbus.model.inverter_threephase import ThreePhaseInverter
from givenergy_modbus.model.plant import Plant, PlantCapabilities
Expand Down Expand Up @@ -79,6 +84,15 @@ def __init__(
self.last_successful_refresh: datetime | None = None
self.consecutive_failures: int = 0
self.total_failures: int = 0
# Cumulative count of polls that returned *some* data but had one or
# more register reads fail (RefreshPartiallySucceeded). Distinct from
# total_failures (which counts polls that yielded no usable data) so a
# flaky single device — e.g. dodgy RS485 wiring to one battery — shows
# up here without eroding the hard-failure metrics.
self.partial_failures: int = 0
# The ReadFailure records from the most recent partial poll, surfaced as
# a diagnostic sensor attribute so users can see *which* device dropped.
self.last_partial_failures: list[ReadFailure] = []
self._last_inverter_time: datetime | None = None
self._unchanged_ticks: int = 0
self._active_tick: int = 0
Expand All @@ -100,39 +114,117 @@ async def _async_update_data(self) -> Plant:
else await self._active_update()
)

self._last_inverter_time = plant.inverter.system_time
self.last_successful_refresh = dt_util.utcnow()
self.consecutive_failures = 0
# A fully clean poll — clear any stale partial-failure detail so the
# diagnostic sensor stops naming devices that have since recovered.
# (The cumulative partial_failures counter is left untouched.)
self.last_partial_failures = []
self._mark_success(plant)
return plant
Comment thread
coderabbitai[bot] marked this conversation as resolved.
except RefreshPartiallySucceeded as exc:
if reconnecting:
# No reliable per-device fallback on a (re)connect seed: a
# half-populated initial plant (the dropped device reads
# "unknown") is worse than a clean full retry. detect() already
# gated device presence, so a partial seed is most likely a
# transient hiccup on a present device. Route through the same
# tolerance gate as a timeout — serves last-known data on an
# in-process reconnect, or raises UpdateFailed (→
# ConfigEntryNotReady) on a cold start so HA retries setup.
self._record_failure()
return await self._serve_last_known_or_fail("Partial data on (re)connect seed", exc)
# Steady state: serve the partial. The dropped device's last-known
# register values ride along (frozen) while the rest stay fresh —
# this is the behaviour change #125 buys us (one offline battery no
# longer discards every other reading for the tick).
self._record_partial(exc)
self._mark_success(exc.plant)
return exc.plant
except UpdateFailed:
self.consecutive_failures += 1
self.total_failures += 1
self._record_failure()
raise
except TimeoutError:
# Keep the client alive — timeouts are transient and the TCP
# connection is likely still valid.
self.consecutive_failures += 1
self.total_failures += 1
if self.data is None or self.consecutive_failures >= self.timeout_tolerance:
await self._reset_client()
raise UpdateFailed(
f"Timed out communicating with inverter "
f"({self.consecutive_failures} consecutive failures)"
except RefreshFailed as err:
self._record_failure()
if self._is_timeout_failure(err):
# Every read timed out — treat like the bare-TimeoutError path
# below: transient, keep the client and serve last-known data
# until the tolerance window is exhausted.
return await self._serve_last_known_or_fail(
"Timed out communicating with inverter", err
)
_LOGGER.warning(
"Timed out communicating with inverter (failure %d/%d); serving last known data",
self.consecutive_failures,
self.timeout_tolerance,
await self._reset_client()
raise UpdateFailed(f"Error communicating with inverter: {err}") from err
except TimeoutError as err:
# Defensive: a timeout not wrapped in RefreshFailed. Keep the client
# alive — timeouts are transient and the TCP connection is likely
# still valid.
self._record_failure()
return await self._serve_last_known_or_fail(
"Timed out communicating with inverter", err
)
return self.data
except Exception as err:
self.consecutive_failures += 1
self.total_failures += 1
self._record_failure()
await self._reset_client()
raise UpdateFailed(
f"Error communicating with inverter: {str(err) or type(err).__name__}"
) from err

# ------------------------------------------------------------------
# Failure / success bookkeeping
# ------------------------------------------------------------------

def _mark_success(self, plant: Plant) -> None:
"""Record a tick that yielded usable data (full or partial success)."""
self._last_inverter_time = plant.inverter.system_time
self.last_successful_refresh = dt_util.utcnow()
self.consecutive_failures = 0

def _record_failure(self) -> None:
"""Count a poll that yielded no usable data."""
self.consecutive_failures += 1
self.total_failures += 1

def _record_partial(self, exc: RefreshPartiallySucceeded) -> None:
"""Count a degraded-but-usable poll and surface which reads dropped."""
self.partial_failures += 1
self.last_partial_failures = exc.failures
_LOGGER.warning(
"Partial refresh: %d register read(s) failed; serving last-known "
"values for those banks. Failures: %s",
len(exc.failures),
exc.failures,
)

def _is_timeout_failure(self, err: RefreshFailed) -> bool:
"""True if every underlying cause of a RefreshFailed is a timeout.

Timeout-only failures are treated as transient (tolerance window);
anything else resets the client and fails immediately.
"""
cause = err.cause
if isinstance(cause, BaseExceptionGroup):
_, rest = cause.split(TimeoutError)
return rest is None
return isinstance(cause, TimeoutError)

async def _serve_last_known_or_fail(self, message: str, err: BaseException) -> Plant:
"""Serve last-known data within the tolerance window, else reset and fail.

Mirrors the long-standing timeout-tolerance behaviour: a transient blip
keeps the client and replays `self.data` up to `timeout_tolerance`
consecutive failures; past that (or with no data yet) the client is
reset and UpdateFailed raised.
"""
if self.data is not None and self.consecutive_failures < self.timeout_tolerance:
_LOGGER.warning(
"%s (failure %d/%d); serving last known data",
message,
self.consecutive_failures,
self.timeout_tolerance,
)
return self.data
await self._reset_client()
raise UpdateFailed(f"{message} ({self.consecutive_failures} consecutive failures)") from err

# ------------------------------------------------------------------
# Update strategies
# ------------------------------------------------------------------
Expand All @@ -143,12 +235,17 @@ async def _active_update(self) -> Plant:
Holding registers (config, charge slots, …) are re-read only every
_full_refresh_every ticks; input registers (real-time data) are read
every tick.

Raises RefreshPartiallySucceeded / RefreshFailed straight up to
_async_update_data, which owns the seed-vs-steady-state policy (it's the
only caller that knows whether this tick is a reconnect seed).
"""
assert self._client is not None # _async_update_data ensures this
full_refresh = self._active_tick % self._full_refresh_every == 0
self._active_tick += 1
await self._client.refresh_plant(full_refresh=full_refresh, retries=self.retries)
return self._client.plant
if full_refresh:
await self._client.load_config(retries=self.retries)
return await self._client.refresh(retries=self.retries)

async def _passive_update(self, reconnecting: bool) -> Plant:
"""Seed the cache on (re)connect; on subsequent ticks read the cached plant.
Expand All @@ -158,8 +255,8 @@ async def _passive_update(self, reconnecting: bool) -> Plant:
"""
assert self._client is not None # _async_update_data ensures this
if reconnecting:
await self._client.refresh_plant(full_refresh=True, retries=self.retries)
return self._client.plant
await self._client.load_config(retries=self.retries)
return await self._client.refresh(retries=self.retries)

plant = self._client.plant
self._check_cache_freshness(plant)
Expand Down
4 changes: 3 additions & 1 deletion custom_components/givenergy_local/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Increment whenever the generated YAML layout changes in a meaningful way.
# __init__.py compares this against the last-generated version stored in HA's
# persistent Store and raises a Repairs issue when they diverge.
DASHBOARD_VERSION = 2
DASHBOARD_VERSION = 3


def generate_dashboard(inv: str, bats: list[str], max_power_kw: int = 10) -> str:
Expand Down Expand Up @@ -363,6 +363,8 @@ def _diagnostics_view(inv: str, max_power_kw: int) -> str:
name: Last Successful Refresh
- entity: {_i(inv, "consecutive_refresh_failures")}
name: Consecutive Failures
- entity: {_i(inv, "partial_refresh_failures")}
name: Partial Failures
Comment thread
dewet22 marked this conversation as resolved.
- entity: {_i(inv, "total_refresh_failures")}
name: Total Failures

Expand Down
2 changes: 1 addition & 1 deletion custom_components/givenergy_local/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"iot_class": "local_polling",
"issue_tracker": "https://github.com/dewet22/givenergy-hass/issues",
"requirements": [
"givenergy-modbus>=2.1.0a4,<3.0.0"
"givenergy-modbus>=2.1.0a5,<3.0.0"
],
"version": "1.1.0"
}
43 changes: 43 additions & 0 deletions custom_components/givenergy_local/sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def fn(bat: Battery) -> Any:
@dataclass(frozen=True, kw_only=True)
class GivEnergyCoordinatorSensorDescription(SensorEntityDescription):
value_fn: Callable[[GivEnergyUpdateCoordinator], Any] = field(default=lambda _: None)
# None (not a dummy lambda) so sensors without attributes skip the call entirely.
attributes_fn: Callable[[GivEnergyUpdateCoordinator], dict[str, Any] | None] | None = None


INVERTER_SENSORS: tuple[GivEnergyInverterSensorDescription, ...] = (
Expand Down Expand Up @@ -895,6 +897,30 @@ class GivEnergyCoordinatorSensorDescription(SensorEntityDescription):
)


def _partial_failure_attributes(
coordinator: GivEnergyUpdateCoordinator,
) -> dict[str, Any] | None:
"""Summarise the most recent partial poll's failed reads for the UI.

Names the device(s) that dropped (e.g. "0x34" for a battery) plus the
per-bank detail, so a flaky device can be identified even though its
entities stay available with stale data.
"""
failures = coordinator.last_partial_failures
if not failures:
return None
return {
"last_failed_devices": sorted({f"0x{f.device_address:02x}" for f in failures}),
"last_failure_count": len(failures),
"last_failures": [
f"0x{f.device_address:02x} "
f"{getattr(f.request_type, 'value', f.request_type)} "
f"@ {f.base_register}+{f.register_count}"
for f in failures
],
Comment thread
dewet22 marked this conversation as resolved.
}


COORDINATOR_SENSORS: tuple[GivEnergyCoordinatorSensorDescription, ...] = (
GivEnergyCoordinatorSensorDescription(
key="last_successful_refresh",
Expand All @@ -920,6 +946,17 @@ class GivEnergyCoordinatorSensorDescription(SensorEntityDescription):
value_fn=lambda coord: coord.total_failures,
entity_category=EntityCategory.DIAGNOSTIC,
),
GivEnergyCoordinatorSensorDescription(
key="partial_failures",
name="Partial Refresh Failures",
# Polls that returned data but had some register reads fail. The
# attributes name the device(s) that dropped — the only UI signal of a
# flaky device, since its entities stay available with frozen values.
state_class=SensorStateClass.TOTAL_INCREASING,
value_fn=lambda coord: coord.partial_failures,
attributes_fn=_partial_failure_attributes,
entity_category=EntityCategory.DIAGNOSTIC,
),
)


Expand Down Expand Up @@ -1053,3 +1090,9 @@ def available(self) -> bool:
@property
def native_value(self) -> Any:
return self.entity_description.value_fn(self.coordinator)

@property
def extra_state_attributes(self) -> dict[str, Any] | None:
if self.entity_description.attributes_fn is None:
return None
return self.entity_description.attributes_fn(self.coordinator)
Comment thread
dewet22 marked this conversation as resolved.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ version = "0.0.0"
description = "Home Assistant custom component for GivEnergy inverters via local Modbus TCP"
requires-python = ">=3.14.2"
dependencies = [
"givenergy-modbus>=2.1.0a4,<3.0.0",
"givenergy-modbus>=2.1.0a5,<3.0.0",
]

[dependency-groups]
Expand Down
19 changes: 17 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,25 @@ def mock_plant(mock_inverter, mock_battery) -> MagicMock:

@pytest.fixture
def mock_client(mock_plant) -> AsyncMock:
client = AsyncMock()
# spec_set deliberately omits the deprecated refresh_plant() so any lingering
# call to it fails fast (AttributeError) rather than silently auto-mocking —
# guards the #125 migration against regressions.
client = AsyncMock(
spec_set=[
"connected",
"plant",
"refresh",
"load_config",
"connect",
"detect",
"close",
"one_shot_command",
]
)
client.connected = True
client.plant = mock_plant
client.refresh_plant = AsyncMock(return_value=mock_plant)
client.refresh = AsyncMock(return_value=mock_plant)
client.load_config = AsyncMock(return_value=mock_plant)
client.connect = AsyncMock()
client.detect = AsyncMock()
client.close = AsyncMock()
Expand Down
Loading
Loading