From 82ee65163c4fa975ebdb3ec058356af38e3092d1 Mon Sep 17 00:00:00 2001 From: James Falcon Date: Wed, 2 Oct 2024 14:01:23 -0500 Subject: [PATCH] feat: Conditionally remove networkd online dependency on Ubuntu Traditionally, cloud-init-network.service (previously cloud-init.service) waited for network connectivity (via systemd service ordering) before running. This has caused cloud-init-network.service to block boot for a significant amount of time. For the vast majority of boots, this network connectivity isn't required. This commit removes the ordering After=systemd-networkd-wait-online.service, but checks the datasource and user data in the init-local timeframe to see if network connectivity will be necessary in the init network timeframe. If so, when the init network service starts, it will call systemd-networkd-wait-online manually in the same manner that the systemd-networkd-wait-online.service does to wait for network connectivity. This commit affects Ubuntu only due to the various number of service orderings and network renderers possible, along with the downstream synchronization needed. However, a new overrideable method in the Distro class should make this optimization trivial to implement for any other distro. --- cloudinit/cmd/main.py | 81 +++++++- cloudinit/distros/__init__.py | 13 ++ cloudinit/distros/debian.py | 5 + cloudinit/helpers.py | 1 + cloudinit/net/netplan.py | 30 +++ systemd/cloud-init-network.service.tmpl | 2 + .../datasources/test_nocloud.py | 4 +- .../modules/test_boothook.py | 63 ++++-- .../modules/test_combined.py | 8 + tests/integration_tests/util.py | 10 +- tests/unittests/cmd/test_main.py | 190 +++++++++++++++++- 11 files changed, 381 insertions(+), 26 deletions(-) diff --git a/cloudinit/cmd/main.py b/cloudinit/cmd/main.py index 0a75d7cc502..1c0440bafa0 100644 --- a/cloudinit/cmd/main.py +++ b/cloudinit/cmd/main.py @@ -19,7 +19,7 @@ import traceback import logging import yaml -from typing import Tuple, Callable +from typing import Optional, Tuple, Callable, Union from cloudinit import netinfo from cloudinit import signal_handler @@ -39,6 +39,7 @@ from cloudinit.config import cc_set_hostname from cloudinit.config.modules import Modules from cloudinit.config.schema import validate_cloudconfig_schema +from cloudinit.lifecycle import log_with_downgradable_level from cloudinit.reporting import events from cloudinit.settings import ( PER_INSTANCE, @@ -334,6 +335,68 @@ def _should_bring_up_interfaces(init, args): return not args.local +def _should_wait_via_cloud_config( + raw_config: Optional[Union[str, bytes]] +) -> bool: + if not raw_config: + return False + + # If our header is anything other than #cloud-config, wait + possible_header: Union[bytes, str] = raw_config.strip()[:13] + if isinstance(possible_header, str): + decoded_header = possible_header + elif isinstance(possible_header, bytes): + try: + decoded_header = possible_header.decode("utf-8") + except UnicodeDecodeError: + return True + if not decoded_header.startswith("#cloud-config"): + return True + + try: + userdata_yaml = yaml.safe_load(raw_config) + except Exception as e: + log_with_downgradable_level( + logger=LOG, + version="24.4", + requested_level=logging.WARNING, + msg="Unexpected failure parsing userdata: %s", + args=e, + ) + return True + + # These all have the potential to require network access, so we should wait + if "write_files" in userdata_yaml: + return any("source" in item for item in userdata_yaml["write_files"]) + return bool( + userdata_yaml.get("bootcmd") + or userdata_yaml.get("random_seed", {}).get("command") + or userdata_yaml.get("mounts") + ) + + +def _should_wait_on_network(datasource: Optional[sources.DataSource]) -> bool: + """Determine if we should wait on network connectivity for cloud-init. + + We need to wait if: + - We have no datasource + - We have user data that is anything other than cloud-config + - This can likely be further optimized in the future to include + other user data types + - We have user data that requires network access + """ + if not datasource: + return True + return any( + _should_wait_via_cloud_config(config) + for config in [ + datasource.get_userdata_raw(), + datasource.get_vendordata_raw(), + datasource.get_vendordata2_raw(), + ] + ) + + def main_init(name, args): deps = [sources.DEP_FILESYSTEM, sources.DEP_NETWORK] if args.local: @@ -411,6 +474,9 @@ def main_init(name, args): mode = sources.DSMODE_LOCAL if args.local else sources.DSMODE_NETWORK if mode == sources.DSMODE_NETWORK: + if os.path.exists(init.paths.get_runpath(".wait-on-network")): + LOG.debug("Will wait for network connectivity before continuing") + init.distro.wait_for_network() existing = "trust" sys.stderr.write("%s\n" % (netinfo.debug_info())) else: @@ -478,9 +544,22 @@ def main_init(name, args): # dhcp clients to advertize this hostname to any DDNS services # LP: #1746455. _maybe_set_hostname(init, stage="local", retry_stage="network") + init.apply_network_config(bring_up=bring_up_interfaces) if mode == sources.DSMODE_LOCAL: + if _should_wait_on_network(init.datasource): + LOG.debug( + "Network connectivity determined necessary for " + "cloud-init's network stage" + ) + util.write_file(init.paths.get_runpath(".wait-on-network"), "") + else: + LOG.debug( + "Network connectivity determined unnecessary for " + "cloud-init's network stage" + ) + if init.datasource.dsmode != mode: LOG.debug( "[%s] Exiting. datasource %s not in local mode.", diff --git a/cloudinit/distros/__init__.py b/cloudinit/distros/__init__.py index e65cbfb5d89..d9dfaf57bf8 100644 --- a/cloudinit/distros/__init__.py +++ b/cloudinit/distros/__init__.py @@ -1574,6 +1574,19 @@ def device_part_info(devpath: str) -> tuple: # name in /dev/ return diskdevpath, ptnum + def wait_for_network(self): + """Ensure that cloud-init has network connectivity. + + For most distros, this is a no-op as cloud-init's network service is + ordered in boot to start after network connectivity has been achieved. + As an optimization, distros may opt to order cloud-init's network + service immediately after cloud-init's local service, and only + require network connectivity if it has been deemed necessary. + This method is a hook for distros to implement this optimization. + It is called during cloud-init's network stage if it was determined + that network connectivity is necessary in cloud-init's network stage. + """ + def _apply_hostname_transformations_to_url(url: str, transformations: list): """ diff --git a/cloudinit/distros/debian.py b/cloudinit/distros/debian.py index cef6e1fb8c0..0c3f659a3a9 100644 --- a/cloudinit/distros/debian.py +++ b/cloudinit/distros/debian.py @@ -15,6 +15,7 @@ from cloudinit.distros.package_management.apt import Apt from cloudinit.distros.package_management.package_manager import PackageManager from cloudinit.distros.parsers.hostname import HostnameConf +from cloudinit.net import netplan from cloudinit.net.netplan import CLOUDINIT_NETPLAN_FILE LOG = logging.getLogger(__name__) @@ -220,6 +221,10 @@ def set_keymap(self, layout: str, model: str, variant: str, options: str): # be needed self.manage_service("restart", "console-setup") + def wait_for_network(self): + """Ensure that cloud-init's network service has network connectivity""" + netplan.wait_for_network() + def _maybe_remove_legacy_eth0(path="/etc/network/interfaces.d/eth0.cfg"): """Ubuntu cloud images previously included a 'eth0.cfg' that had diff --git a/cloudinit/helpers.py b/cloudinit/helpers.py index 470a5b2013f..2470afafc75 100644 --- a/cloudinit/helpers.py +++ b/cloudinit/helpers.py @@ -351,6 +351,7 @@ def __init__(self, path_cfgs: dict, ds=None): "vendor_scripts": "scripts/vendor", "warnings": "warnings", "hotplug.enabled": "hotplug.enabled", + ".wait-on-network": ".wait-on-network", } # Set when a datasource becomes active self.datasource = ds diff --git a/cloudinit/net/netplan.py b/cloudinit/net/netplan.py index 50af17694d5..10b661170f6 100644 --- a/cloudinit/net/netplan.py +++ b/cloudinit/net/netplan.py @@ -13,6 +13,7 @@ IPV6_DYNAMIC_TYPES, SYS_CLASS_NET, get_devicelist, + network_manager, renderer, should_add_gateway_onlink_flag, subnet_is_ipv6, @@ -570,3 +571,32 @@ def available(target=None): if not subp.which(p, search=search, target=target): return False return True + + +def wait_for_network() -> None: + """On networkd systems, manually wait for systemd-networkd-wait-online""" + # At the moment, this is only supported using the networkd renderer. + if network_manager.available(): + LOG.debug("NetworkManager is enabled, skipping networkd wait") + return + wait_online_def: str = subp.subp( + ["systemctl", "cat", "systemd-networkd-wait-online.service"] + ).stdout + + # We need to extract the ExecStart= lines from the service definition. + # If we come across an ExecStart= line that is empty, that clears any + # previously found commands, which we should expect from the drop-in. + # Since the service is a oneshot, we can have multiple ExecStart= lines + # and systemd runs them in parallel. We'll run them serially since + # there's really no gain for us in running them in parallel. + wait_commands: List[List[str]] = [] + for line in wait_online_def.splitlines(): + if line.startswith("ExecStart="): + command_str = line.split("=", 1)[1].strip() + if not command_str: + wait_commands.clear() + else: + wait_commands.append(command_str.split()) + + for command in wait_commands: + subp.subp(command) diff --git a/systemd/cloud-init-network.service.tmpl b/systemd/cloud-init-network.service.tmpl index bed96a855ad..6c431e1f91a 100644 --- a/systemd/cloud-init-network.service.tmpl +++ b/systemd/cloud-init-network.service.tmpl @@ -9,7 +9,9 @@ Wants=cloud-init-local.service Wants=sshd-keygen.service Wants=sshd.service After=cloud-init-local.service +{% if variant not in ["debian", "ubuntu"] %} After=systemd-networkd-wait-online.service +{% endif %} {% if variant in ["ubuntu", "unknown", "debian"] %} After=networking.service {% endif %} diff --git a/tests/integration_tests/datasources/test_nocloud.py b/tests/integration_tests/datasources/test_nocloud.py index c44418c3e08..0929e77914b 100644 --- a/tests/integration_tests/datasources/test_nocloud.py +++ b/tests/integration_tests/datasources/test_nocloud.py @@ -15,11 +15,12 @@ override_kernel_command_line, verify_clean_boot, verify_clean_log, + wait_online_called, ) VENDOR_DATA = """\ #cloud-config -runcmd: +bootcmd: - touch /var/tmp/seeded_vendordata_test_file """ @@ -99,6 +100,7 @@ def test_nocloud_seedfrom_vendordata(client: IntegrationInstance): client.restart() assert client.execute("cloud-init status").ok assert "seeded_vendordata_test_file" in client.execute("ls /var/tmp") + assert wait_online_called(client.execute("cat /var/log/cloud-init.log")) SMBIOS_USERDATA = """\ diff --git a/tests/integration_tests/modules/test_boothook.py b/tests/integration_tests/modules/test_boothook.py index e9e1e796d17..4dbe266e2d2 100644 --- a/tests/integration_tests/modules/test_boothook.py +++ b/tests/integration_tests/modules/test_boothook.py @@ -4,7 +4,11 @@ import pytest from tests.integration_tests.instances import IntegrationInstance -from tests.integration_tests.util import verify_clean_boot, verify_clean_log +from tests.integration_tests.util import ( + verify_clean_boot, + verify_clean_log, + wait_online_called, +) USER_DATA = """\ ## template: jinja @@ -18,24 +22,39 @@ @pytest.mark.user_data(USER_DATA) -def test_boothook_header_runs_part_per_instance(client: IntegrationInstance): - """Test boothook handling creates a script that runs per-boot. - Streams stderr and stdout are directed to /var/log/cloud-init-output.log. - """ - instance_id = client.instance.execute("cloud-init query instance-id") - RE_BOOTHOOK = f"BOOTHOOK: {instance_id}: is called every boot" - log = client.read_from_file("/var/log/cloud-init.log") - verify_clean_log(log) - verify_clean_boot(client) - output = client.read_from_file("/boothook.txt") - assert 1 == len(re.findall(RE_BOOTHOOK, output)) - client.restart() - output = client.read_from_file("/boothook.txt") - assert 2 == len(re.findall(RE_BOOTHOOK, output)) - output_log = client.read_from_file("/var/log/cloud-init-output.log") - expected_msgs = [ - "BOOTHOOKstdout", - "boothooks/part-001: 3: BOOTHOOK/0: not found", - ] - for msg in expected_msgs: - assert msg in output_log +class TestBoothook: + def test_boothook_header_runs_part_per_instance( + self, + class_client: IntegrationInstance, + ): + """Test boothook handling creates a script that runs per-boot. + Streams stderr and stdout are directed to + /var/log/cloud-init-output.log. + """ + client = class_client + instance_id = client.instance.execute("cloud-init query instance-id") + RE_BOOTHOOK = f"BOOTHOOK: {instance_id}: is called every boot" + log = client.read_from_file("/var/log/cloud-init.log") + verify_clean_log(log) + verify_clean_boot(client) + output = client.read_from_file("/boothook.txt") + assert 1 == len(re.findall(RE_BOOTHOOK, output)) + client.restart() + output = client.read_from_file("/boothook.txt") + assert 2 == len(re.findall(RE_BOOTHOOK, output)) + output_log = client.read_from_file("/var/log/cloud-init-output.log") + expected_msgs = [ + "BOOTHOOKstdout", + "boothooks/part-001: 3: BOOTHOOK/0: not found", + ] + for msg in expected_msgs: + assert msg in output_log + + def test_boothook_waits_for_network( + self, class_client: IntegrationInstance + ): + """Test boothook handling waits for network before running.""" + client = class_client + assert wait_online_called( + client.read_from_file("/var/log/cloud-init.log") + ) diff --git a/tests/integration_tests/modules/test_combined.py b/tests/integration_tests/modules/test_combined.py index 759c9528e62..5e4369e7881 100644 --- a/tests/integration_tests/modules/test_combined.py +++ b/tests/integration_tests/modules/test_combined.py @@ -30,6 +30,7 @@ verify_clean_boot, verify_clean_log, verify_ordered_items_in_text, + wait_online_called, ) USER_DATA = """\ @@ -541,6 +542,13 @@ def test_unicode(self, class_client: IntegrationInstance): client = class_client assert "💩" == client.read_from_file("/var/tmp/unicode_data") + @pytest.mark.skipif(not IS_UBUNTU, reason="Ubuntu-only behavior") + def test_networkd_wait_online(self, class_client: IntegrationInstance): + client = class_client + assert not wait_online_called( + client.read_from_file("/var/log/cloud-init.log") + ) + @pytest.mark.user_data(USER_DATA) class TestCombinedNoCI: diff --git a/tests/integration_tests/util.py b/tests/integration_tests/util.py index 828dae2e99a..4f728f39a2d 100644 --- a/tests/integration_tests/util.py +++ b/tests/integration_tests/util.py @@ -15,7 +15,11 @@ from cloudinit.subp import subp from tests.integration_tests.integration_settings import PLATFORM -from tests.integration_tests.releases import CURRENT_RELEASE, NOBLE +from tests.integration_tests.releases import ( + CURRENT_RELEASE, + NETWORK_SERVICE_FILE, + NOBLE, +) LOG = logging.getLogger("integration_testing.util") @@ -611,3 +615,7 @@ def push_and_enable_systemd_unit( client.write_to_file(service_filename, content) client.execute(f"chmod 0644 {service_filename}", use_sudo=True) client.execute(f"systemctl enable {unit_name}", use_sudo=True) + + +def wait_online_called(log: str) -> bool: + return "Running command ['/lib/systemd/systemd-networkd-wait-online" in log diff --git a/tests/unittests/cmd/test_main.py b/tests/unittests/cmd/test_main.py index 482bf576930..508e969acb6 100644 --- a/tests/unittests/cmd/test_main.py +++ b/tests/unittests/cmd/test_main.py @@ -3,19 +3,39 @@ import copy import getpass import os +import textwrap from collections import namedtuple from unittest import mock import pytest -from cloudinit import safeyaml +from cloudinit import safeyaml, util from cloudinit.cmd import main +from cloudinit.subp import SubpResult from cloudinit.util import ensure_dir, load_text_file, write_file MyArgs = namedtuple( "MyArgs", "debug files force local reporter subcommand skip_log_setup" ) +FAKE_SERVICE_FILE = """\ +[Unit] +Description=Wait for Network to be Configured + +[Service] +Type=oneshot +ExecStart=/usr/lib/systemd/systemd-networkd-wait-online + +# /run/systemd/system/systemd-networkd-wait-online.service.d/10-netplan.conf +[Unit] +ConditionPathIsSymbolicLink=/run/systemd/generator/network-online.target.wants/systemd-networkd-wait-online.service + +[Service] +ExecStart= +ExecStart=/lib/systemd/systemd-networkd-wait-online -i eth0:degraded +ExecStart=/lib/systemd/systemd-networkd-wait-online --any -o routable -i eth0 +""" # noqa: E501 + class TestMain: @pytest.fixture(autouse=True) @@ -139,6 +159,174 @@ def test_main_sys_argv( main.main() m_clean_get_parser.assert_called_once() + @pytest.mark.parametrize( + "ds,userdata,expected", + [ + # If we have no datasource, wait regardless + (None, None, True), + (None, "#!/bin/bash\n - echo hello", True), + # Empty user data shouldn't wait + (mock.Mock(), "", False), + # Bootcmd always wait + (mock.Mock(), "#cloud-config\nbootcmd:\n - echo hello", True), + # Bytes are valid too + (mock.Mock(), b"#cloud-config\nbootcmd:\n - echo hello", True), + # write_files with 'source' wait + ( + mock.Mock(), + textwrap.dedent( + """\ + #cloud-config + write_files: + - source: + uri: http://example.com + headers: + Authorization: Basic stuff + User-Agent: me + """ + ), + True, + ), + # write_files without 'source' don't wait + ( + mock.Mock(), + textwrap.dedent( + """\ + #cloud-config + write_files: + - content: hello + encoding: b64 + owner: root:root + path: /etc/sysconfig/selinux + permissions: '0644' + """ + ), + False, + ), + # random_seed with 'command' wait + ( + mock.Mock(), + "#cloud-config\nrandom_seed:\n command: true", + True, + ), + # random_seed without 'command' no wait + ( + mock.Mock(), + textwrap.dedent( + """\ + #cloud-config + random_seed: + data: 4 + encoding: raw + file: /dev/urandom + """ + ), + False, + ), + # mounts always wait + ( + mock.Mock(), + "#cloud-config\nmounts:\n - [ /dev/sdb, /mnt, ext4 ]", + True, + ), + # Not parseable as yaml + (mock.Mock(), "#cloud-config\nbootcmd:\necho hello", True), + # Non-cloud-config + (mock.Mock(), "#!/bin/bash\n - echo hello", True), + # Something that won't decode to utf-8 + (mock.Mock(), os.urandom(100), True), + # Something small that shouldn't decode to utf-8 + (mock.Mock(), os.urandom(5), True), + ], + ) + def test_should_wait_on_network(self, ds, userdata, expected): + if ds: + ds.get_userdata_raw = mock.Mock(return_value=userdata) + ds.get_vendordata_raw = mock.Mock(return_value=None) + ds.get_vendordata2_raw = mock.Mock(return_value=None) + assert main._should_wait_on_network(ds) is expected + + # Here we rotate our configs to ensure that any of userdata, + # vendordata, or vendordata2 can be the one that causes us to wait. + for _ in range(2): + if ds: + ( + ds.get_userdata_raw, + ds.get_vendordata_raw, + ds.get_vendordata2_raw, + ) = ( + ds.get_vendordata_raw, + ds.get_vendordata2_raw, + ds.get_userdata_raw, + ) + assert main._should_wait_on_network(ds) is expected + + @pytest.mark.parametrize( + "distro,should_wait,expected_add_wait", + [ + ("ubuntu", True, True), + ("ubuntu", False, False), + ("debian", True, True), + ("debian", False, False), + ("centos", True, False), + ("rhel", False, False), + ("fedora", True, False), + ("suse", False, False), + ("gentoo", True, False), + ("arch", False, False), + ("alpine", False, False), + ], + ) + def test_distro_wait_for_network( + self, + distro, + should_wait, + expected_add_wait, + cloud_cfg, + mocker, + fake_filesystem, + ): + def fake_subp(*args, **kwargs): + if args == ( + ["systemctl", "cat", "systemd-networkd-wait-online.service"], + ): + return SubpResult(FAKE_SERVICE_FILE, "") + return SubpResult("", "") + + m_nm = mocker.patch( + "cloudinit.net.network_manager.available", return_value=False + ) + m_subp = mocker.patch("cloudinit.subp.subp", side_effect=fake_subp) + if should_wait: + util.write_file(".wait-on-network", "") + + cfg, cloud_cfg_file = cloud_cfg + cfg["system_info"]["distro"] = distro + write_file(cloud_cfg_file, safeyaml.dumps(cfg)) + cmdargs = MyArgs( + debug=False, + files=None, + force=False, + local=False, + reporter=None, + subcommand="init", + skip_log_setup=False, + ) + main.main_init("init", cmdargs) + expected_subps = [ + "/lib/systemd/systemd-networkd-wait-online -i eth0:degraded", + "/lib/systemd/systemd-networkd-wait-online --any -o routable -i eth0", # noqa: E501 + ] + if expected_add_wait: + m_nm.assert_called_once() + for expected_subp in expected_subps: + assert ( + mock.call(expected_subp.split()) in m_subp.call_args_list + ) + else: + m_nm.assert_not_called() + m_subp.assert_not_called() + class TestShouldBringUpInterfaces: @pytest.mark.parametrize(