Skip to content

Commit

Permalink
feat: Conditionally remove networkd online dependency on Ubuntu
Browse files Browse the repository at this point in the history
Traditionally, cloud-init-network.service (previously
cloud-init.service) waited for network connectivity (via systemd
service ordering) before running. This has caused
cloud-init-network.service to block boot for a significant amount of
time. For the vast majority of boots, this network connectivity
isn't required.

This commit removes the ordering
After=systemd-networkd-wait-online.service, but checks the datasource
and user data in the init-local timeframe to see if network
connectivity will be necessary in the init network timeframe.
If so, when the init network service starts, it will call
systemd-networkd-wait-online manually in the same manner that the
systemd-networkd-wait-online.service does to wait for network
connectivity.

This commit affects Ubuntu only due to the various number of service
orderings and network renderers possible, along with the downstream
synchronization needed. However, a new overrideable method in the
Distro class should make this optimization trivial to implement for
any other distro.
  • Loading branch information
TheRealFalcon committed Oct 3, 2024
1 parent dc3cfde commit 82ee651
Show file tree
Hide file tree
Showing 11 changed files with 381 additions and 26 deletions.
81 changes: 80 additions & 1 deletion cloudinit/cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import traceback
import logging
import yaml
from typing import Tuple, Callable
from typing import Optional, Tuple, Callable, Union

from cloudinit import netinfo
from cloudinit import signal_handler
Expand All @@ -39,6 +39,7 @@
from cloudinit.config import cc_set_hostname
from cloudinit.config.modules import Modules
from cloudinit.config.schema import validate_cloudconfig_schema
from cloudinit.lifecycle import log_with_downgradable_level
from cloudinit.reporting import events
from cloudinit.settings import (
PER_INSTANCE,
Expand Down Expand Up @@ -334,6 +335,68 @@ def _should_bring_up_interfaces(init, args):
return not args.local


def _should_wait_via_cloud_config(
raw_config: Optional[Union[str, bytes]]
) -> bool:
if not raw_config:
return False

# If our header is anything other than #cloud-config, wait
possible_header: Union[bytes, str] = raw_config.strip()[:13]
if isinstance(possible_header, str):
decoded_header = possible_header
elif isinstance(possible_header, bytes):
try:
decoded_header = possible_header.decode("utf-8")
except UnicodeDecodeError:
return True
if not decoded_header.startswith("#cloud-config"):
return True

try:
userdata_yaml = yaml.safe_load(raw_config)
except Exception as e:
log_with_downgradable_level(
logger=LOG,
version="24.4",
requested_level=logging.WARNING,
msg="Unexpected failure parsing userdata: %s",
args=e,
)
return True

# These all have the potential to require network access, so we should wait
if "write_files" in userdata_yaml:
return any("source" in item for item in userdata_yaml["write_files"])
return bool(
userdata_yaml.get("bootcmd")
or userdata_yaml.get("random_seed", {}).get("command")
or userdata_yaml.get("mounts")
)


def _should_wait_on_network(datasource: Optional[sources.DataSource]) -> bool:
"""Determine if we should wait on network connectivity for cloud-init.
We need to wait if:
- We have no datasource
- We have user data that is anything other than cloud-config
- This can likely be further optimized in the future to include
other user data types
- We have user data that requires network access
"""
if not datasource:
return True
return any(
_should_wait_via_cloud_config(config)
for config in [
datasource.get_userdata_raw(),
datasource.get_vendordata_raw(),
datasource.get_vendordata2_raw(),
]
)


def main_init(name, args):
deps = [sources.DEP_FILESYSTEM, sources.DEP_NETWORK]
if args.local:
Expand Down Expand Up @@ -411,6 +474,9 @@ def main_init(name, args):
mode = sources.DSMODE_LOCAL if args.local else sources.DSMODE_NETWORK

if mode == sources.DSMODE_NETWORK:
if os.path.exists(init.paths.get_runpath(".wait-on-network")):
LOG.debug("Will wait for network connectivity before continuing")
init.distro.wait_for_network()
existing = "trust"
sys.stderr.write("%s\n" % (netinfo.debug_info()))
else:
Expand Down Expand Up @@ -478,9 +544,22 @@ def main_init(name, args):
# dhcp clients to advertize this hostname to any DDNS services
# LP: #1746455.
_maybe_set_hostname(init, stage="local", retry_stage="network")

init.apply_network_config(bring_up=bring_up_interfaces)

if mode == sources.DSMODE_LOCAL:
if _should_wait_on_network(init.datasource):
LOG.debug(
"Network connectivity determined necessary for "
"cloud-init's network stage"
)
util.write_file(init.paths.get_runpath(".wait-on-network"), "")
else:
LOG.debug(
"Network connectivity determined unnecessary for "
"cloud-init's network stage"
)

if init.datasource.dsmode != mode:
LOG.debug(
"[%s] Exiting. datasource %s not in local mode.",
Expand Down
13 changes: 13 additions & 0 deletions cloudinit/distros/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,19 @@ def device_part_info(devpath: str) -> tuple:
# name in /dev/
return diskdevpath, ptnum

def wait_for_network(self):
"""Ensure that cloud-init has network connectivity.
For most distros, this is a no-op as cloud-init's network service is
ordered in boot to start after network connectivity has been achieved.
As an optimization, distros may opt to order cloud-init's network
service immediately after cloud-init's local service, and only
require network connectivity if it has been deemed necessary.
This method is a hook for distros to implement this optimization.
It is called during cloud-init's network stage if it was determined
that network connectivity is necessary in cloud-init's network stage.
"""


def _apply_hostname_transformations_to_url(url: str, transformations: list):
"""
Expand Down
5 changes: 5 additions & 0 deletions cloudinit/distros/debian.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from cloudinit.distros.package_management.apt import Apt
from cloudinit.distros.package_management.package_manager import PackageManager
from cloudinit.distros.parsers.hostname import HostnameConf
from cloudinit.net import netplan
from cloudinit.net.netplan import CLOUDINIT_NETPLAN_FILE

LOG = logging.getLogger(__name__)
Expand Down Expand Up @@ -220,6 +221,10 @@ def set_keymap(self, layout: str, model: str, variant: str, options: str):
# be needed
self.manage_service("restart", "console-setup")

def wait_for_network(self):
"""Ensure that cloud-init's network service has network connectivity"""
netplan.wait_for_network()


def _maybe_remove_legacy_eth0(path="/etc/network/interfaces.d/eth0.cfg"):
"""Ubuntu cloud images previously included a 'eth0.cfg' that had
Expand Down
1 change: 1 addition & 0 deletions cloudinit/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def __init__(self, path_cfgs: dict, ds=None):
"vendor_scripts": "scripts/vendor",
"warnings": "warnings",
"hotplug.enabled": "hotplug.enabled",
".wait-on-network": ".wait-on-network",
}
# Set when a datasource becomes active
self.datasource = ds
Expand Down
30 changes: 30 additions & 0 deletions cloudinit/net/netplan.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
IPV6_DYNAMIC_TYPES,
SYS_CLASS_NET,
get_devicelist,
network_manager,
renderer,
should_add_gateway_onlink_flag,
subnet_is_ipv6,
Expand Down Expand Up @@ -570,3 +571,32 @@ def available(target=None):
if not subp.which(p, search=search, target=target):
return False
return True


def wait_for_network() -> None:
"""On networkd systems, manually wait for systemd-networkd-wait-online"""
# At the moment, this is only supported using the networkd renderer.
if network_manager.available():
LOG.debug("NetworkManager is enabled, skipping networkd wait")
return
wait_online_def: str = subp.subp(
["systemctl", "cat", "systemd-networkd-wait-online.service"]
).stdout

# We need to extract the ExecStart= lines from the service definition.
# If we come across an ExecStart= line that is empty, that clears any
# previously found commands, which we should expect from the drop-in.
# Since the service is a oneshot, we can have multiple ExecStart= lines
# and systemd runs them in parallel. We'll run them serially since
# there's really no gain for us in running them in parallel.
wait_commands: List[List[str]] = []
for line in wait_online_def.splitlines():
if line.startswith("ExecStart="):
command_str = line.split("=", 1)[1].strip()
if not command_str:
wait_commands.clear()
else:
wait_commands.append(command_str.split())

for command in wait_commands:
subp.subp(command)
2 changes: 2 additions & 0 deletions systemd/cloud-init-network.service.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ Wants=cloud-init-local.service
Wants=sshd-keygen.service
Wants=sshd.service
After=cloud-init-local.service
{% if variant not in ["debian", "ubuntu"] %}
After=systemd-networkd-wait-online.service
{% endif %}
{% if variant in ["ubuntu", "unknown", "debian"] %}
After=networking.service
{% endif %}
Expand Down
4 changes: 3 additions & 1 deletion tests/integration_tests/datasources/test_nocloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
override_kernel_command_line,
verify_clean_boot,
verify_clean_log,
wait_online_called,
)

VENDOR_DATA = """\
#cloud-config
runcmd:
bootcmd:
- touch /var/tmp/seeded_vendordata_test_file
"""

Expand Down Expand Up @@ -99,6 +100,7 @@ def test_nocloud_seedfrom_vendordata(client: IntegrationInstance):
client.restart()
assert client.execute("cloud-init status").ok
assert "seeded_vendordata_test_file" in client.execute("ls /var/tmp")
assert wait_online_called(client.execute("cat /var/log/cloud-init.log"))


SMBIOS_USERDATA = """\
Expand Down
63 changes: 41 additions & 22 deletions tests/integration_tests/modules/test_boothook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
import pytest

from tests.integration_tests.instances import IntegrationInstance
from tests.integration_tests.util import verify_clean_boot, verify_clean_log
from tests.integration_tests.util import (
verify_clean_boot,
verify_clean_log,
wait_online_called,
)

USER_DATA = """\
## template: jinja
Expand All @@ -18,24 +22,39 @@


@pytest.mark.user_data(USER_DATA)
def test_boothook_header_runs_part_per_instance(client: IntegrationInstance):
"""Test boothook handling creates a script that runs per-boot.
Streams stderr and stdout are directed to /var/log/cloud-init-output.log.
"""
instance_id = client.instance.execute("cloud-init query instance-id")
RE_BOOTHOOK = f"BOOTHOOK: {instance_id}: is called every boot"
log = client.read_from_file("/var/log/cloud-init.log")
verify_clean_log(log)
verify_clean_boot(client)
output = client.read_from_file("/boothook.txt")
assert 1 == len(re.findall(RE_BOOTHOOK, output))
client.restart()
output = client.read_from_file("/boothook.txt")
assert 2 == len(re.findall(RE_BOOTHOOK, output))
output_log = client.read_from_file("/var/log/cloud-init-output.log")
expected_msgs = [
"BOOTHOOKstdout",
"boothooks/part-001: 3: BOOTHOOK/0: not found",
]
for msg in expected_msgs:
assert msg in output_log
class TestBoothook:
def test_boothook_header_runs_part_per_instance(
self,
class_client: IntegrationInstance,
):
"""Test boothook handling creates a script that runs per-boot.
Streams stderr and stdout are directed to
/var/log/cloud-init-output.log.
"""
client = class_client
instance_id = client.instance.execute("cloud-init query instance-id")
RE_BOOTHOOK = f"BOOTHOOK: {instance_id}: is called every boot"
log = client.read_from_file("/var/log/cloud-init.log")
verify_clean_log(log)
verify_clean_boot(client)
output = client.read_from_file("/boothook.txt")
assert 1 == len(re.findall(RE_BOOTHOOK, output))
client.restart()
output = client.read_from_file("/boothook.txt")
assert 2 == len(re.findall(RE_BOOTHOOK, output))
output_log = client.read_from_file("/var/log/cloud-init-output.log")
expected_msgs = [
"BOOTHOOKstdout",
"boothooks/part-001: 3: BOOTHOOK/0: not found",
]
for msg in expected_msgs:
assert msg in output_log

def test_boothook_waits_for_network(
self, class_client: IntegrationInstance
):
"""Test boothook handling waits for network before running."""
client = class_client
assert wait_online_called(
client.read_from_file("/var/log/cloud-init.log")
)
8 changes: 8 additions & 0 deletions tests/integration_tests/modules/test_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
verify_clean_boot,
verify_clean_log,
verify_ordered_items_in_text,
wait_online_called,
)

USER_DATA = """\
Expand Down Expand Up @@ -541,6 +542,13 @@ def test_unicode(self, class_client: IntegrationInstance):
client = class_client
assert "💩" == client.read_from_file("/var/tmp/unicode_data")

@pytest.mark.skipif(not IS_UBUNTU, reason="Ubuntu-only behavior")
def test_networkd_wait_online(self, class_client: IntegrationInstance):
client = class_client
assert not wait_online_called(
client.read_from_file("/var/log/cloud-init.log")
)


@pytest.mark.user_data(USER_DATA)
class TestCombinedNoCI:
Expand Down
10 changes: 9 additions & 1 deletion tests/integration_tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@

from cloudinit.subp import subp
from tests.integration_tests.integration_settings import PLATFORM
from tests.integration_tests.releases import CURRENT_RELEASE, NOBLE
from tests.integration_tests.releases import (
CURRENT_RELEASE,
NETWORK_SERVICE_FILE,
NOBLE,
)

LOG = logging.getLogger("integration_testing.util")

Expand Down Expand Up @@ -611,3 +615,7 @@ def push_and_enable_systemd_unit(
client.write_to_file(service_filename, content)
client.execute(f"chmod 0644 {service_filename}", use_sudo=True)
client.execute(f"systemctl enable {unit_name}", use_sudo=True)


def wait_online_called(log: str) -> bool:
return "Running command ['/lib/systemd/systemd-networkd-wait-online" in log
Loading

0 comments on commit 82ee651

Please sign in to comment.