Skip to content

Commit 330dfd6

Browse files
olethanh authored and Antonyjin committed
Make vm_id assignment more robust (#714)
Remove the counter-based way of assigning a vm_id, as it didn't work reliably. Jira ticket: ALEPH-272. That method was broken when persistent instances were loaded at startup. Since the "new" feature that allows persistent instances to survive an aleph-vm reboot, if an instance was started and aleph-vm was then stopped and restarted, the counter method could reassign the IP and break the existing VMs. The secondary reason was that the feature wasn't working properly with the default settings, as `2**available_bits` returned 1. So that code path was only used if the node owner tweaked some undocumented settings, making it hard to identify and debug on production nodes.
1 parent 9d0e30e commit 330dfd6

File tree

1 file changed

+10
-25
lines changed

1 file changed

+10
-25
lines changed

src/aleph/vm/pool.py

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,13 @@
2828

2929

3030
class VmPool:
31-
"""Pool of VMs already started and used to decrease response time.
31+
"""Pool of existing VMs
32+
33+
For function VM we keep the VM a while after they have run, so we can reuse them and thus decrease response time.
3234
After running, a VM is saved for future reuse from the same function during a
3335
configurable duration.
34-
35-
The counter is used by the VMs to set their tap interface name and the corresponding
36-
IPv4 subnet.
3736
"""
3837

39-
counter: int # Used to provide distinct ids to network interfaces
4038
executions: dict[ItemHash, VmExecution]
4139
message_cache: dict[str, ExecutableMessage]
4240
network: Network | None
@@ -45,7 +43,6 @@ class VmPool:
4543
creation_lock: asyncio.Lock
4644

4745
def __init__(self, loop: asyncio.AbstractEventLoop):
48-
self.counter = settings.START_ID_INDEX
4946
self.executions = {}
5047
self.message_cache = {}
5148

@@ -150,25 +147,13 @@ def get_unique_vm_id(self) -> int:
150147
This identifier is used to name the network interface and in the IPv4 range
151148
dedicated to the VM.
152149
"""
153-
_, network_range = settings.IPV4_ADDRESS_POOL.split("/")
154-
available_bits = int(network_range) - settings.IPV4_NETWORK_PREFIX_LENGTH
155-
self.counter += 1
156-
if self.counter < 2**available_bits:
157-
# In common cases, use the counter itself as the vm_id. This makes it
158-
# easier to debug.
159-
return self.counter
160-
else:
161-
# The value of the counter is too high and some functions such as the
162-
# IPv4 range dedicated to the VM do not support such high values.
163-
#
164-
# We therefore recycle vm_id values from executions that are not running
165-
# anymore.
166-
currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
167-
for i in range(settings.START_ID_INDEX, 255**2):
168-
if i not in currently_used_vm_ids:
169-
return i
170-
msg = "No available value for vm_id."
171-
raise ValueError(msg)
150+
# Take the first id that is not already taken
151+
currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
152+
for i in range(settings.START_ID_INDEX, 255**2):
153+
if i not in currently_used_vm_ids:
154+
return i
155+
msg = "No available value for vm_id."
156+
raise ValueError(msg)
172157

173158
def get_running_vm(self, vm_hash: ItemHash) -> VmExecution | None:
174159
"""Return a running VM or None. Disables the VM expiration task."""

0 commit comments

Comments
 (0)