From 19def8547f5c33ed256fc092d89938b11298ff3b Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Wed, 7 Oct 2020 07:52:48 -0500 Subject: [PATCH] Reduce orm overhead by grouping object expiration While #40982 solved the performance overhead of expiring every time we commit the event session, it caused a regression which was fixed in #41349. Ideally we could avoid the overhead of expiring objects on commit since we are never going to use them again by using expunge after the commit to remove the objects we no longer need. Unfortunately this causes sqlalchemy to spend quite a bit of time sorting out state when adding new objects to the event session after an expunge_all so it wasn't a viable option. As not expiring the objects causes unexpected side effects, we can limit the impact of having to expire them by only doing the expire every 120 commits (by default every 2 minutes) instead of every commit (by default every second). This works because the expire operation itself is expensive regardless of the number of objects it is expiring. 
To verify this approach does not leak memory, a scale test integration was created (https://github.com/bdraco/scaletest) and run for hours along with an objgraph dump setup every 30 seconds to verify States and Events were being properly disposed of (https://mg.pov.lt/objgraph/objgraph.html#locating-and-filtering-objects) In testing this took a machine running the scaletest integration from 100% cpu to 4% cpu --- homeassistant/components/recorder/__init__.py | 36 ++++++++++++++----- tests/components/recorder/common.py | 8 ++--- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/homeassistant/components/recorder/__init__.py b/homeassistant/components/recorder/__init__.py index 8e6a251f660d47..3736ecaaf29d89 100644 --- a/homeassistant/components/recorder/__init__.py +++ b/homeassistant/components/recorder/__init__.py @@ -58,8 +58,13 @@ DEFAULT_DB_INTEGRITY_CHECK = True DEFAULT_DB_MAX_RETRIES = 10 DEFAULT_DB_RETRY_WAIT = 3 +DEFAULT_COMMIT_INTERVAL = 1 KEEPALIVE_TIME = 30 +# Controls how often we clean up +# States and Events objects +EXPIRE_AFTER_COMMITS = 120 + CONF_AUTO_PURGE = "auto_purge" CONF_DB_URL = "db_url" CONF_DB_MAX_RETRIES = "db_max_retries" @@ -91,9 +96,9 @@ vol.Coerce(int), vol.Range(min=0) ), vol.Optional(CONF_DB_URL): cv.string, - vol.Optional(CONF_COMMIT_INTERVAL, default=1): vol.All( - vol.Coerce(int), vol.Range(min=0) - ), + vol.Optional( + CONF_COMMIT_INTERVAL, default=DEFAULT_COMMIT_INTERVAL + ): vol.All(vol.Coerce(int), vol.Range(min=0)), vol.Optional( CONF_DB_MAX_RETRIES, default=DEFAULT_DB_MAX_RETRIES ): cv.positive_int, @@ -238,6 +243,7 @@ def __init__( self.exclude_t = exclude_t self._timechanges_seen = 0 + self._commits_without_expire = 0 self._keepalive_count = 0 self._old_states = {} self._pending_expunge = [] @@ -345,6 +351,7 @@ def async_purge(now): ) self.event_session = self.get_session() + self.event_session.expire_on_commit = False # Use a session for the event read loop # with a commit every time the event time # has 
changed. This reduces the disk io. @@ -485,24 +492,35 @@ def _reopen_event_session(self): try: self.event_session = self.get_session() + self.event_session.expire_on_commit = False except Exception as err: # pylint: disable=broad-except # Must catch the exception to prevent the loop from collapsing _LOGGER.exception("Error while creating new event session: %s", err) def _commit_event_session(self): + self._commits_without_expire += 1 + try: - self.event_session.flush() - for dbstate in self._pending_expunge: - # Expunge the state so its not expired - # until we use it later for dbstate.old_state - self.event_session.expunge(dbstate) - self._pending_expunge = [] + if self._pending_expunge: + self.event_session.flush() + for dbstate in self._pending_expunge: + # Expunge the state so its not expired + # until we use it later for dbstate.old_state + self.event_session.expunge(dbstate) + self._pending_expunge = [] self.event_session.commit() except Exception as err: _LOGGER.error("Error executing query: %s", err) self.event_session.rollback() raise + # Expire is an expensive operation (frequently more expensive + # than the flush and commit itself) so we only + # do it after EXPIRE_AFTER_COMMITS commits + if self._commits_without_expire == EXPIRE_AFTER_COMMITS: + self._commits_without_expire = 0 + self.event_session.expire_all() + @callback def event_listener(self, event): """Listen for new events and put them in the process queue.""" diff --git a/tests/components/recorder/common.py b/tests/components/recorder/common.py index 4c25771398c46c..1d0e6dbbfa00e5 100644 --- a/tests/components/recorder/common.py +++ b/tests/components/recorder/common.py @@ -1,12 +1,12 @@ """Common test utils for working with recorder.""" +from datetime import timedelta + from homeassistant.components import recorder from homeassistant.util import dt as dt_util from tests.common import fire_time_changed -DB_COMMIT_INTERVAL = 50 - def wait_recording_done(hass): """Block till recording is done.""" 
@@ -18,6 +18,6 @@ def wait_recording_done(hass): def trigger_db_commit(hass): """Force the recorder to commit.""" - for _ in range(DB_COMMIT_INTERVAL): + for _ in range(recorder.DEFAULT_COMMIT_INTERVAL): # We only commit on time change - fire_time_changed(hass, dt_util.utcnow()) + fire_time_changed(hass, dt_util.utcnow() + timedelta(seconds=1))