diff --git a/README.md b/README.md
index 59333116b..0043b43ac 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,8 @@ Unless your focus is developing the VM-Connector, using the Docker image is easi
    ```
 
 2. **Install the Debian Package**
+   This will also install some required dependencies and configuration files, making the setup easier.
+
    Replace `1.2.0` with the latest release version.
 
    **On Debian 12 (Bookworm)**:
@@ -94,16 +96,26 @@ Unless your focus is developing the VM-Connector, using the Docker image is easi
    To prevent conflicts, deactivate the system version of aleph-vm by disabling its `systemd` service.
 
    ```shell
+   sudo systemctl stop aleph-vm-supervisor
    sudo systemctl disable aleph-vm-supervisor.service
    ```
 
 4. **Clone the Repository and Set Up a Virtual Environment**
    - Clone the aleph-vm repository to your development environment.
-   - Create a virtual environment to manage dependencies.
-   - Inside the virtual environment, run:
+   - Install the system dependencies required to build the Python virtual environment and its packages:
+     ```shell
+     apt install libdbus-1-dev libglib2.0-dev libsystemd-dev python3.12-venv
+     ```
+
+   - Create a virtual environment to manage dependencies. It needs access to the system site packages, since some dependencies, such as the network interface bindings, are provided via Debian packages:
+     ```shell
+     python3 -m venv --system-site-packages ~/.virtualenvs/aleph-vm
+     ```
+
+   - Activate the virtual environment and install the dependencies:
      ```shell
+     source ~/.virtualenvs/aleph-vm/bin/activate
      pip install -e .
      ```
 
diff --git a/packaging/aleph-vm/etc/haproxy/haproxy-aleph.cfg b/packaging/aleph-vm/etc/haproxy/haproxy-aleph.cfg
index f5ee90926..39171e295 100644
--- a/packaging/aleph-vm/etc/haproxy/haproxy-aleph.cfg
+++ b/packaging/aleph-vm/etc/haproxy/haproxy-aleph.cfg
@@ -113,7 +113,7 @@ backend bk_http
     # For HTTP - Use Host header
     http-request set-header Host %[req.hdr(host)]
     use-server %[var(txn.host),lower,map(/etc/haproxy/http_domains.map)] if { var(txn.host) -m found }
-    server web1 127.0.0.1:4020
+    server fallback_local 127.0.0.1:4020  # Default fallback to the aleph-vm supervisor
 
 # Backend to terminate TLS for fallback (uses internal http frontend)
diff --git a/src/aleph/vm/haproxy.py b/src/aleph/vm/haproxy.py
index 7ecb0652f..94f1b305a 100644
--- a/src/aleph/vm/haproxy.py
+++ b/src/aleph/vm/haproxy.py
@@ -270,6 +270,9 @@ def update_haproxy_backends(socket_path, backend_name, map_file_path, weight=1):
 
     # Remove servers that are not in the map file
     servers_to_remove = set(current_servers) - processed_servers
+    # Never remove the fallback server ("web1" is its legacy name)
+    servers_to_remove.discard("fallback_local")
+    servers_to_remove.discard("web1")
     if servers_to_remove:
         logger.info(f"Removing {len(servers_to_remove)} servers no longer in map file")
         for server_name in servers_to_remove:
@@ -283,9 +286,9 @@
     return True
 
 
-async def fetch_list() -> list[dict]:
+async def fetch_list(domain: str | None = None) -> list[dict]:
     async with aiohttp.ClientSession() as client:
-        resp = await client.get(url=str(settings.DOMAIN_SERVICE_URL))
+        resp = await client.get(url=str(settings.DOMAIN_SERVICE_URL), params={"crn": domain} if domain else None)
         resp.raise_for_status()
         instances = await resp.json()
         if len(instances) == 0:
@@ -294,7 +297,11 @@
 
 
 async def fetch_list_and_update(socket_path, local_vms: list[str], force_update):
-    instances = await fetch_list()
+    if settings.DOMAIN_NAME in ("localhost", "vm.example.org"):
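+        # These default placeholder values mean no public domain is configured for this CRN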
logger.info("Skipping domain update because DOMAIN_NAME is not set") + + instances = await fetch_list(settings.DOMAIN_NAME) # filter on local hash instances = [i for i in instances if i["item_hash"] in local_vms] # This should match the config in haproxy.cfg diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 2e38005bb..3486a436a 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -65,6 +65,9 @@ def to_dict(self): return self.__dict__ +LAST_ASSIGNED_HOST_PORT = 24000 + + class VmExecution: """ Control the execution of a VM on a high level. @@ -136,7 +139,11 @@ async def update_port_redirects(self, requested_ports: dict[int, dict[str, bool] for vm_port in redirect_to_add: target = requested_ports[vm_port] - host_port = get_available_host_port(start_port=24000) + host_port = get_available_host_port(start_port=LAST_ASSIGNED_HOST_PORT) + LAST_ASSIGNED_HOST_PORT = host_port + if LAST_ASSIGNED_HOST_PORT > 65535: + LAST_ASSIGNED_HOST_PORT = 24000 + for protocol in SUPPORTED_PROTOCOL_FOR_REDIRECT: if target[protocol]: add_port_redirect_rule(self.vm.vm_id, interface, host_port, vm_port, protocol) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 9d99a7163..5ff2e88ee 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,4 +1,3 @@ -import asyncio import json import re import shutil @@ -17,9 +16,9 @@ async def get_hardware_info(): hw_info = {"cpu": None, "memory": None} for hw in data["children"][0]["children"]: - if hw["id"] == "cpu" or hw["id"].startswith("cpu"): + if hw["id"] == "cpu" or hw["id"].startswith("cpu") and not hw.get("disabled"): hw_info["cpu"] = hw - elif hw["class"] == "memory" and hw["id"] == "memory": + elif hw["class"] == "memory" and hw["id"] == "memory" and not hw.get("disabled"): hw_info["memory"] = hw return hw_info diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 8b745a9fc..b8f3061f3 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -32,6 +32,7 @@ about_execution_records, about_executions, about_login, + debug_haproxy, list_executions, list_executions_v2, notify_allocation, @@ -85,6 +86,7 @@ async def error_middleware(request, handler) -> web.Response: status = exc.status return web.json_response({"error": message}, status=status) except Exception as exc: + logger.exception("Unhandled exception for %s", request.path) message = str(exc) status = 500 return web.json_response({"error": message, "error_type": str(type(exc))}, status=status) @@ -145,6 +147,7 @@ def setup_webapp(pool: VmPool | None): web.post("/control/machine/{ref}/confidential/initialize", operate_confidential_initialize), web.get("/control/machine/{ref}/confidential/measurement", operate_confidential_measurement), web.post("/control/machine/{ref}/confidential/inject_secret", operate_confidential_inject_secret), + web.get("/debug/haproxy", debug_haproxy), # /status APIs are used to check that the VM Orchestrator is running properly web.get("/status/check/fastapi", status_check_fastapi), web.get("/status/check/fastapi/legacy", status_check_fastapi_legacy), diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 00143c686..c16369053 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -18,6 +18,7 @@ from aleph_message.models import InstanceContent, ItemHash, MessageType, PaymentType from 
 from pydantic import ValidationError
 
+from aleph.vm import haproxy
 from aleph.vm.conf import settings
 from aleph.vm.controllers.firecracker.executable import (
     ResourceDownloadError,
@@ -148,6 +149,26 @@ async def about_login(request: web.Request) -> web.Response:
         return web.json_response({"success": False}, status=401)
 
 
+async def debug_haproxy(request: web.Request) -> web.Response:
+    """Debug endpoint to check the status of HAProxy and the domains mapped to it.
+
+    This is a debug endpoint and should not be used in production. The interface is subject to change.
+    """
+    import pathlib
+
+    socket = settings.HAPROXY_SOCKET
+    if not pathlib.Path(socket).exists():
+        logger.info("HAProxy not running? Socket not found, skipping domain mapping status")
+        return web.json_response({"status": "no socket"}, status=web.HTTPServiceUnavailable.status_code)
+    r: dict = {"status": "ok", "backends": {}}
+    for backend in haproxy.HAPROXY_BACKENDS:
+        r["backends"][str(backend["name"])] = haproxy.get_current_backends(socket, backend["name"])
+    return web.json_response(
+        r,
+        dumps=dumps_for_json,
+    )
+
+
 @cors_allow_all
 async def about_executions(request: web.Request) -> web.Response:
     "/about/executions/details Debugging endpoint with full execution details."
diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
index 0a8bd3e58..d8aec10f6 100644
--- a/src/aleph/vm/pool.py
+++ b/src/aleph/vm/pool.py
@@ -332,6 +332,7 @@ async def load_persistent_executions(self):
                 await execution.fetch_port_redirect_config_and_setup()
 
             self.executions[vm_hash] = execution
+            execution.record = saved_execution
         else:
             execution.uuid = saved_execution.uuid
             await execution.record_usage()
diff --git a/tests/supervisor/test_haproxy.py b/tests/supervisor/test_haproxy.py
new file mode 100644
index 000000000..ced395939
--- /dev/null
+++ b/tests/supervisor/test_haproxy.py
@@ -0,0 +1,178 @@
+import pytest
+
+from aleph.vm import haproxy
+
+# Sample response for https://api.dns.public.aleph.sh/instances/list
+sample_domain_instance_list = [
+    {
+        "name": "api-dev.thronetools.com",
+        "item_hash": "747b52c712e16642b498f16c4c6e68d5fb00ddbaf8d2a0dc7bd298d33abb9124",
+        "ipv6": "2a01:240:ad00:2502:3:747b:52c7:12e1",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.15.1/32"},
+    },
+    {
+        "name": "centurion.cybernetwork.me",
+        "item_hash": "cefb9373558927d70365746900a410f01e1340ecff0dda93deb672f55bb70ac8",
+        "ipv6": "2a01:240:ad00:2502:3:cefb:9373:5581",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.52.1/32"},
+    },
+    {
+        "name": "cms-dev.thronetools.com",
+        "item_hash": "747b52c712e16642b498f16c4c6e68d5fb00ddbaf8d2a0dc7bd298d33abb9124",
+        "ipv6": "2a01:240:ad00:2502:3:747b:52c7:12e1",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.15.1/32"},
+    },
+    {
+        "name": "platform-api.3mera.dev",
+        "item_hash": "d78e81d99e7468302bdaf82b5ca338b486629cf813384bdc3282e2b8fa7f478f",
+        "ipv6": "2a01:240:ad00:2502:3:d78e:81d9:9e71",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.30.1/32"},
+    },
+    {
+        "name": "platform-variants.3mera.dev",
+        "item_hash": "d78e81d99e7468302bdaf82b5ca338b486629cf813384bdc3282e2b8fa7f478f",
+        "ipv6": "2a01:240:ad00:2502:3:d78e:81d9:9e71",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.30.1/32"},
+    },
+    {
+        "name": "platform.3mera.dev",
+        "item_hash": "d78e81d99e7468302bdaf82b5ca338b486629cf813384bdc3282e2b8fa7f478f",
+        "ipv6": "2a01:240:ad00:2502:3:d78e:81d9:9e71",
+        "ipv4": {"public": "46.247.131.211", "local": "172.16.30.1/32"},
+    },
+    {
+        "name": "praetorian.cybernetwork.me",
"ec18fa850f6a530a8c0e6a616b0df5def3ab3662eb6feeba8ece580780a86dc6", + "ipv6": "2a01:240:ad00:2502:3:ec18:fa85:f61", + "ipv4": {"public": "46.247.131.211", "local": "172.16.19.1/32"}, + }, + { + "name": "template-frontend.3mera.dev", + "item_hash": "d78e81d99e7468302bdaf82b5ca338b486629cf813384bdc3282e2b8fa7f478f", + "ipv6": "2a01:240:ad00:2502:3:d78e:81d9:9e71", + "ipv4": {"public": "46.247.131.211", "local": "172.16.30.1/32"}, + }, + { + "name": "test-twentysix-cloud.gerardmolina.com", + "item_hash": "31826cf53c655bd25d50f7e371242baf240d4f854372c798a37bb9eb6c562682", + "ipv6": "2a01:240:ad00:2501:3:3182:6cf5:3c61", + "ipv4": {"public": "46.255.204.201", "local": "172.16.5.1/32"}, + }, +] + + +@pytest.fixture +def mock_sample_domain_instance_list(mocker): + mocker.patch("aleph.vm.haproxy.fetch_list", mocker.AsyncMock(return_value=sample_domain_instance_list)) + + +@pytest.mark.asyncio +async def test_fetch_list(mock_sample_domain_instance_list): + list = await haproxy.fetch_list() + assert len(list) == 9 + + +@pytest.fixture +def mock_small_domain_list(mocker): + small_list = [ + { + "name": "echo.agot.be", + "item_hash": "decadecadecadecadecadecadecadecadecadecadecadecadecadecadecadeca", + "ipv6": "2a01:240:ad00:2502:3:747b:52c7:12e1", + "ipv4": {"public": "46.247.131.211", "local": "172.16.4.1/32"}, + } + ] + mocker.patch("aleph.vm.haproxy.fetch_list", mocker.AsyncMock(return_value=small_list)) + + +@pytest.mark.asyncio +async def test_update_map_file(mock_small_domain_list, tmp_path): + map_file_path = tmp_path / "backend.map" + instance_list = await haproxy.fetch_list() + assert instance_list + + haproxy.update_mapfile(instance_list, str(map_file_path), 22) + content = map_file_path.read_text() + assert content == "echo.agot.be 172.16.4.2:22\n" + + +@pytest.fixture +def mock_socket_command(mocker): + commands = [] + existing_servers: list[str] = [] + + def mock_response(socket_path, command): # noqa: ARG001 + commands.append(command) + if "show servers state" in command: + return "1\n# be_id be_name srv_id srv_name srv_addr srv_op_state\n" + "\n".join(mock.existing_servers) + elif "disable server" in command: + return "" + elif "set server" in command: + return "" + elif "enable server" in command: + return "" + return "" + + mock = mocker.patch("aleph.vm.haproxy.send_socket_command", mock_response) + mock.existing_servers = existing_servers + mock.commands = commands + return mock + + +@pytest.mark.asyncio +async def test_update_backend_add_server(mock_socket_command, tmp_path): + map_file_path = tmp_path / "backend.map" + map_file_path.write_text("echo.agot.be 172.16.4.2:22\n") + socket_path = "fakyfake" + + # Run test + haproxy.update_haproxy_backends(socket_path, "test_backend", map_file_path, weight=1) + + # Verify commands + assert mock_socket_command.commands == [ + "show servers state test_backend", + # "disable server test_backend echo.agot.be", + "add server test_backend/echo.agot.be 172.16.4.2:22 weight 1 maxconn 30", + # "set server test_backend echo.agot.be addr 172.16.4.2 port 22", + # "set server test_backend echo.agot.be weight 1", + "enable server test_backend/echo.agot.be", + ] + + +@pytest.mark.asyncio +def test_update_backend_add_server_remove_server(mock_socket_command, tmp_path): + map_file_path = tmp_path / "backend.map" + map_file_path.write_text("echo.agot.be 172.16.4.2:22\n") + socket_path = "fakyfake" + + mock_socket_command.existing_servers = [ + "8 test_backend 1 existing_bk 127.0.0.1 2 0 1 1 683294 1 0 2 0 0 0 0 - 4020 - 0 0 - - 0" + ] + 
+    haproxy.update_haproxy_backends(socket_path, "test_backend", map_file_path, weight=1)
+
+    # Verify commands
+    assert mock_socket_command.commands == [
+        "show servers state test_backend",
+        "add server test_backend/echo.agot.be 172.16.4.2:22 weight 1 maxconn 30",
+        "enable server test_backend/echo.agot.be",
+        "set server test_backend/existing_bk state maint",
+        "del server test_backend/existing_bk",
+    ]
+
+
+def test_update_backend_does_not_remove_fallback(mock_socket_command, tmp_path):
+    map_file_path = tmp_path / "backend.map"
+    map_file_path.write_text("echo.agot.be 172.16.4.2:22\n")
+    socket_path = "fakyfake"
+
+    mock_socket_command.existing_servers = [
+        "8 test_backend 1 fallback_local 127.0.0.1 2 0 1 1 683294 1 0 2 0 0 0 0 - 4020 - 0 0 - - 0"
+    ]
+    haproxy.update_haproxy_backends(socket_path, "test_backend", map_file_path, weight=1)
+
+    # Verify commands
+    assert mock_socket_command.commands == [
+        "show servers state test_backend",
+        "add server test_backend/echo.agot.be 172.16.4.2:22 weight 1 maxconn 30",
+        "enable server test_backend/echo.agot.be",
+    ]
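A quick way to exercise the new `/debug/haproxy` endpoint once the orchestrator is running is a small client script. This is a minimal sketch, assuming the orchestrator is reachable on `http://localhost:4020` (the address the `fallback_local` server in `haproxy-aleph.cfg` points at); adjust the address for your setup.

```python
# Minimal sketch: query the /debug/haproxy endpoint and print the servers
# registered in each HAProxy backend. The localhost:4020 address is an
# assumption taken from the fallback server in haproxy-aleph.cfg.
import asyncio

import aiohttp


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("http://localhost:4020/debug/haproxy") as resp:
            resp.raise_for_status()
            data = await resp.json()
    # debug_haproxy returns {"status": ..., "backends": {name: servers}}
    print("status:", data["status"])
    for backend_name, servers in data.get("backends", {}).items():
        print(backend_name, "->", servers)


if __name__ == "__main__":
    asyncio.run(main())
```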