Skip to content

Commit 01ff0c1

Browse files
nesitor and olethanh authored
Implemented GPU compatibility system (#747)
* Feature: Added options to get GPU compatibilities from a settings aggregate.
* Fix: Refactored to also return the model name from the aggregate and use the same device_id format.
* Fix: Include GPU list and move the VM egress IPv6 check on the connectivity check to start notifying the users about the next requirement.
* Fix: Solved code quality issues.
* Fix: Put definitive settings aggregate address
* Fix: Solved issue with type casting and moved the aggregate check.
* Check community payment flow (#751)
* Implement community payment check WIP
* isort
* Check community flow at allocation
* Community flow : fix after testing
* mod Use singleton for the Setting Aggregate
* fix test
* Implement community wallet start time

---------

Co-authored-by: Olivier Le Thanh Duong <[email protected]>
1 parent 7c9392c commit 01ff0c1

File tree

10 files changed

+548
-42
lines changed

10 files changed

+548
-42
lines changed

src/aleph/vm/conf.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from os.path import abspath, exists, isdir, isfile, join
1010
from pathlib import Path
1111
from subprocess import CalledProcessError, check_output
12-
from typing import Any, Literal, NewType
12+
from typing import Any, List, Literal, NewType
1313

1414
from aleph_message.models import Chain
1515
from aleph_message.models.execution.environment import HypervisorType
@@ -280,8 +280,10 @@ class Settings(BaseSettings):
280280
description="Enable GPU pass-through support to VMs, only allowed for QEmu hypervisor",
281281
)
282282

283-
# Tests on programs
283+
# Settings to get from the network aggregates
284+
SETTINGS_AGGREGATE_ADDRESS: str = "0xFba561a84A537fCaa567bb7A2257e7142701ae2A"
284285

286+
# Tests on programs
285287
FAKE_DATA_PROGRAM: Path | None = None
286288
BENCHMARK_FAKE_DATA_PROGRAM = Path(abspath(join(__file__, "../../../../examples/example_fastapi")))
287289

src/aleph/vm/orchestrator/tasks.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import math
55
import time
66
from collections.abc import AsyncIterable
7+
from decimal import Decimal
78
from typing import TypeVar
89

910
import aiohttp
@@ -19,6 +20,10 @@
1920
from yarl import URL
2021

2122
from aleph.vm.conf import settings
23+
from aleph.vm.orchestrator.utils import (
24+
get_community_wallet_address,
25+
is_after_community_wallet_start,
26+
)
2227
from aleph.vm.pool import VmPool
2328
from aleph.vm.utils import create_task_log_exceptions
2429

@@ -35,6 +40,7 @@
3540
logger = logging.getLogger(__name__)
3641

3742
Value = TypeVar("Value")
43+
# Fraction of the required flow that is expected on the community wallet stream;
# the remaining (1 - ratio) is expected on the CRN stream.
# Built from a string on purpose: Decimal(0.2) would inherit the binary float
# representation error (0.2000000000000000111...) and skew every derived amount.
COMMUNITY_STREAM_RATIO = Decimal("0.2")
3844

3945

4046
async def retry_generator(generator: AsyncIterable[Value], max_seconds: int = 8) -> AsyncIterable[Value]:
@@ -154,6 +160,7 @@ async def monitor_payments(app: web.Application):
154160
try:
155161
logger.debug("Monitoring balances task running")
156162
await check_payment(pool)
163+
logger.debug("Monitoring balances task ended")
157164
except Exception as e:
158165
# Catch all exceptions as to never stop the task.
159166
logger.warning(f"check_payment failed {e}", exc_info=True)
@@ -191,31 +198,62 @@ async def check_payment(pool: VmPool):
191198
logger.debug(f"Stopping {last_execution} due to insufficient balance")
192199
await pool.stop_vm(last_execution.vm_hash)
193200
required_balance = await compute_required_balance(executions)
201+
community_wallet = await get_community_wallet_address()
202+
if not community_wallet:
203+
logger.error("Monitor payment ERROR: No community wallet set. Cannot check community payment")
194204

195205
# Check if the balance held in the wallet is sufficient stream tier resources
196206
for sender, chains in pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items():
197207
for chain, executions in chains.items():
198208
try:
199209
stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain)
210+
200211
logger.debug(
201-
f"Get stream flow from Sender {sender} to Receiver {settings.PAYMENT_RECEIVER_ADDRESS} of {stream}"
212+
f"Stream flow from {sender} to {settings.PAYMENT_RECEIVER_ADDRESS} = {stream} {chain.value}"
202213
)
214+
except ValueError as error:
215+
logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}")
216+
continue
217+
try:
218+
community_stream = await get_stream(sender=sender, receiver=community_wallet, chain=chain)
219+
# Report the community stream that was just fetched, not the CRN stream.
logger.debug(f"Stream flow from {sender} to {community_wallet} (community) : {community_stream} {chain}")
220+
203221
except ValueError as error:
204222
logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}")
205223
continue
206224

207-
required_stream = await compute_required_flow(executions)
208-
logger.debug(f"Required stream for Sender {sender} executions: {required_stream}")
209-
# Stop executions until the required stream is reached
210-
while (stream + settings.PAYMENT_BUFFER) < required_stream:
211-
try:
212-
last_execution = executions.pop(-1)
213-
except IndexError: # Empty list
214-
logger.debug("No execution can be maintained due to insufficient stream")
225+
while executions:
226+
executions_with_community = [
227+
execution
228+
for execution in executions
229+
if await is_after_community_wallet_start(execution.times.started_at)
230+
]
231+
232+
required_stream = await compute_required_flow(executions_with_community)
233+
executions_without_community = [
234+
execution
235+
for execution in executions
236+
if not await is_after_community_wallet_start(execution.times.started_at)
237+
]
238+
logger.info("flow community %s", executions_with_community)
239+
logger.info("flow without community %s", executions_without_community)
240+
required_stream_without_community = await compute_required_flow(executions_without_community)
241+
242+
required_crn_stream = required_stream * (1 - COMMUNITY_STREAM_RATIO) + required_stream_without_community
243+
required_community_stream = required_stream * COMMUNITY_STREAM_RATIO
244+
logger.debug(
245+
f"Stream for senders {sender} {len(executions)} executions. CRN : {stream} / {required_crn_stream}."
246+
f"Community: {community_stream} / {required_community_stream}"
247+
)
248+
# Can pay all executions
249+
if (stream + settings.PAYMENT_BUFFER) > required_crn_stream and (
250+
community_stream + settings.PAYMENT_BUFFER
251+
) > required_community_stream:
215252
break
216-
logger.debug(f"Stopping {last_execution} due to insufficient stream")
253+
# Stop executions until the required stream is reached
254+
last_execution = executions.pop(-1)
255+
logger.info(f"Stopping {last_execution} of {sender} due to insufficient stream")
217256
await pool.stop_vm(last_execution.vm_hash)
218-
required_stream = await compute_required_flow(executions)
219257

220258

221259
async def start_payment_monitoring_task(app: web.Application):

src/aleph/vm/orchestrator/utils.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
from datetime import datetime, timedelta, timezone
2+
from logging import getLogger
3+
from typing import Any, TypedDict
4+
5+
import aiohttp
6+
7+
from aleph.vm.conf import settings
8+
9+
logger = getLogger(__name__)
10+
11+
12+
class AggregateSettingsDict(TypedDict):
    """Typed view of the ``settings`` entry of the network settings aggregate."""

    # GPU entries exposed through get_compatible_gpus(); the element schema is
    # not visible from this module.
    compatible_gpus: list[Any]
    # Address of the community wallet.
    community_wallet_address: str
    # Timestamp converted with datetime.fromtimestamp(..., tz=timezone.utc) to
    # obtain the community wallet start time.
    community_wallet_timestamp: int
16+
17+
18+
# Module-level cache of the settings aggregate.
# Holds the last payload stored by update_aggregate_settings(); None until then.
LAST_AGGREGATE_SETTINGS: AggregateSettingsDict | None = None
# UTC time at which the cached payload was stored; None until then.
LAST_AGGREGATE_SETTINGS_FETCHED_AT: datetime | None = None
20+
21+
22+
async def fetch_aggregate_settings() -> AggregateSettingsDict | None:
    """
    Get the settings Aggregate dict from the PyAleph API Aggregate.

    API Endpoint:
        GET /api/v0/aggregates/{address}.json?keys=settings

    For more details, see the PyAleph API documentation:
    https://github.com/aleph-im/pyaleph/blob/master/src/aleph/web/controllers/routes.py#L62

    Returns:
        The ``settings`` entry found under ``data`` in the JSON response.

    Raises:
        aiohttp.ClientResponseError: if the API answers with an error status.
        KeyError: if the response has no ``data`` / ``settings`` entry.
    """
    url = f"{settings.API_SERVER}/api/v0/aggregates/{settings.SETTINGS_AGGREGATE_ADDRESS}.json?keys=settings"
    logger.info(f"Fetching settings aggregate from {url}")

    async with aiohttp.ClientSession() as session:
        # Use the response as a context manager so the connection is released
        # even when raise_for_status() or the JSON decoding fails.
        async with session.get(url) as resp:
            # Raise an error if the request failed
            resp.raise_for_status()
            resp_data = await resp.json()

    return resp_data["data"]["settings"]
42+
43+
44+
async def update_aggregate_settings():
    """Refresh the module-level settings aggregate cache when missing or stale.

    The aggregate is fetched only when nothing is cached yet, or when the cached
    copy was stored more than one minute ago. A failed fetch is logged and the
    previously cached value (if any) is kept, so callers are never interrupted
    by a fetch error.
    """
    global LAST_AGGREGATE_SETTINGS  # noqa: PLW0603
    global LAST_AGGREGATE_SETTINGS_FETCHED_AT  # noqa: PLW0603

    is_stale = (
        LAST_AGGREGATE_SETTINGS_FETCHED_AT is not None
        and datetime.now(tz=timezone.utc) - LAST_AGGREGATE_SETTINGS_FETCHED_AT > timedelta(minutes=1)
    )
    if LAST_AGGREGATE_SETTINGS and not is_stale:
        # Cache hit: no network access needed.
        return

    try:
        aggregate = await fetch_aggregate_settings()
    except Exception:
        # Best effort: keep serving the previous value rather than failing the caller.
        logger.exception("Failed to fetch aggregate settings")
        return

    LAST_AGGREGATE_SETTINGS = aggregate
    LAST_AGGREGATE_SETTINGS_FETCHED_AT = datetime.now(tz=timezone.utc)
61+
62+
63+
async def get_aggregate_settings() -> AggregateSettingsDict | None:
    """Return the cached settings aggregate, refreshing the cache first.

    The settings aggregate is a special aggregate used to share some common
    settings for VM setup. An error is logged when no aggregate is available
    after the refresh attempt; the (empty) cached value is still returned.
    """
    await update_aggregate_settings()

    cached = LAST_AGGREGATE_SETTINGS
    if not cached:
        logger.error("No setting aggregate")
    return cached
72+
73+
74+
async def get_community_wallet_address() -> str | None:
    """Return the community wallet address from the settings aggregate.

    When no aggregate is available, the falsy aggregate value itself is
    returned; when the key is absent, ``None`` is returned.
    """
    aggregate = await get_aggregate_settings()
    if not aggregate:
        return aggregate
    return aggregate.get("community_wallet_address")
77+
78+
79+
async def get_community_wallet_start() -> datetime:
    """Community wallet start time.

    After this timestamp, new PAYG must include a payment to the community wallet.
    When the aggregate is unavailable or carries no ``community_wallet_timestamp``,
    the current UTC time is returned instead.
    """
    aggregate = await get_aggregate_settings()
    if aggregate is not None and "community_wallet_timestamp" in aggregate:
        return datetime.fromtimestamp(aggregate["community_wallet_timestamp"], tz=timezone.utc)
    # No start time known: fall back to "now".
    return datetime.now(tz=timezone.utc)
89+
90+
91+
async def is_after_community_wallet_start(dt: datetime | None = None) -> bool:
    """Tell whether ``dt`` is strictly later than the community wallet start time.

    ``dt`` defaults to the current UTC time, taken before the start time is
    looked up.
    """
    moment = dt if dt else datetime.now(tz=timezone.utc)
    wallet_start = await get_community_wallet_start()
    return moment > wallet_start
97+
98+
99+
def get_compatible_gpus() -> list[Any]:
    """Return the compatible GPU list from the cached settings aggregate.

    Reads the module-level cache only; it does not trigger a refresh. An empty
    list is returned when nothing is cached or when the aggregate has no
    ``compatible_gpus`` entry, so callers never have to handle a ``KeyError``.
    """
    if not LAST_AGGREGATE_SETTINGS:
        return []
    return LAST_AGGREGATE_SETTINGS.get("compatible_gpus") or []

src/aleph/vm/orchestrator/views/__init__.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import binascii
2-
import contextlib
32
import logging
43
from decimal import Decimal
54
from hashlib import sha256
@@ -8,7 +7,6 @@
87
from pathlib import Path
98
from secrets import compare_digest
109
from string import Template
11-
from typing import Optional
1210

1311
import aiodns
1412
import aiohttp
@@ -26,7 +24,7 @@
2624
from aleph.vm.controllers.firecracker.program import FileTooLargeError
2725
from aleph.vm.hypervisors.firecracker.microvm import MicroVMFailedInitError
2826
from aleph.vm.orchestrator import payment, status
29-
from aleph.vm.orchestrator.chain import STREAM_CHAINS, ChainInfo
27+
from aleph.vm.orchestrator.chain import STREAM_CHAINS
3028
from aleph.vm.orchestrator.custom_logs import set_vm_for_logging
3129
from aleph.vm.orchestrator.messages import try_get_message
3230
from aleph.vm.orchestrator.metrics import get_execution_records
@@ -39,6 +37,12 @@
3937
from aleph.vm.orchestrator.pubsub import PubSub
4038
from aleph.vm.orchestrator.resources import Allocation, VMNotification
4139
from aleph.vm.orchestrator.run import run_code_on_request, start_persistent_vm
40+
from aleph.vm.orchestrator.tasks import COMMUNITY_STREAM_RATIO
41+
from aleph.vm.orchestrator.utils import (
42+
get_community_wallet_address,
43+
is_after_community_wallet_start,
44+
update_aggregate_settings,
45+
)
4246
from aleph.vm.orchestrator.views.host_status import (
4347
check_dns_ipv4,
4448
check_dns_ipv6,
@@ -468,6 +472,7 @@ async def update_allocations(request: web.Request):
468472
@cors_allow_all
469473
async def notify_allocation(request: web.Request):
470474
"""Notify instance allocation, only used for Pay as you Go feature"""
475+
await update_aggregate_settings()
471476
try:
472477
data = await request.json()
473478
vm_notification = VMNotification.parse_obj(data)
@@ -526,16 +531,44 @@ async def notify_allocation(request: web.Request):
526531
raise web.HTTPPaymentRequired(reason="Empty payment stream for this instance")
527532

528533
required_flow: Decimal = await fetch_execution_flow_price(item_hash)
529-
530-
if active_flow < required_flow:
534+
community_wallet = await get_community_wallet_address()
535+
required_crn_stream: Decimal
536+
required_community_stream: Decimal
537+
if await is_after_community_wallet_start() and community_wallet:
538+
required_crn_stream = required_flow * (1 - COMMUNITY_STREAM_RATIO)
539+
required_community_stream = required_flow * COMMUNITY_STREAM_RATIO
540+
else: # No community wallet payment
541+
required_crn_stream = required_flow
542+
required_community_stream = Decimal(0)
543+
544+
if active_flow < (required_crn_stream - settings.PAYMENT_BUFFER):
531545
active_flow_per_month = active_flow * 60 * 60 * 24 * (Decimal("30.41666666666923904761904784"))
532-
required_flow_per_month = required_flow * 60 * 60 * 24 * Decimal("30.41666666666923904761904784")
546+
required_flow_per_month = required_crn_stream * 60 * 60 * 24 * Decimal("30.41666666666923904761904784")
533547
return web.HTTPPaymentRequired(
534548
reason="Insufficient payment stream",
535549
text="Insufficient payment stream for this instance\n\n"
536-
f"Required: {required_flow_per_month} / month (flow = {required_flow})\n"
550+
f"Required: {required_flow_per_month} / month (flow = {required_crn_stream})\n"
537551
f"Present: {active_flow_per_month} / month (flow = {active_flow})",
538552
)
553+
554+
if community_wallet and required_community_stream:
555+
community_flow: Decimal = await get_stream(
556+
sender=message.sender,
557+
receiver=community_wallet,
558+
chain=message.content.payment.chain,
559+
)
560+
if community_flow < (required_community_stream - settings.PAYMENT_BUFFER):
561+
active_flow_per_month = community_flow * 60 * 60 * 24 * (Decimal("30.41666666666923904761904784"))
562+
required_flow_per_month = (
563+
required_community_stream * 60 * 60 * 24 * Decimal("30.41666666666923904761904784")
564+
)
565+
return web.HTTPPaymentRequired(
566+
reason="Insufficient payment stream to community",
567+
text="Insufficient payment stream for community \n\n"
568+
f"Required: {required_flow_per_month} / month (flow = {required_community_stream})\n"
569+
f"Present: {active_flow_per_month} / month (flow = {community_flow})\n"
570+
f"Address: {community_wallet}",
571+
)
539572
else:
540573
return web.HTTPBadRequest(reason="Invalid payment method")
541574

src/aleph/vm/orchestrator/views/static/helpers.js

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,42 @@ async function fetchHostCheckStatus () {
5353
return res;
5454
}
5555

56+
/**
 * Fetch the host system usage and summarise the detected GPUs.
 *
 * @returns {Promise<{status: (number|string), details: Array}>} `status` is an
 *   HTML snippet listing each GPU with its compatibility flag (or a
 *   "No GPUs detected" message) on success, a failure message on HTTP 500,
 *   and the raw HTTP status code otherwise. `details` is always left empty.
 */
async function fetchHostSystemUsage () {
    const q = await fetch('/about/usage/system');
    let res = {
        status: q.status,
        details: []
    }
    if(q.ok){
        const answer = await q.json();
        // Tolerate a payload without a GPU section instead of throwing a TypeError.
        const gpu_devices = (answer && answer.gpu && answer.gpu.devices) || [];
        if (gpu_devices.length <= 0) {
            res.status = "<b>No GPUs detected</b>";
        }else{
            res.status = "<ul>";
            for (const gpu_device of gpu_devices){
                let compatible_str = " is compatible &#9989;";
                if (!gpu_device.compatible) {
                    compatible_str = " isn't compatible &#10060;";
                }
                res.status += "<li><b>" + gpu_device.vendor + " | " + gpu_device.device_name + "</b>" + compatible_str + "</li>";
            }
            res.status += "</ul>";
        }
    }
    else {
        switch(Number(q.status)){
            case 500:
                res.status = "Getting Node usage failed &#10060;";
                break;
            default:
                res.status = q.status;
        }
    }

    return res;
}
91+
5692
function objectToString (obj) {
5793
return Object.entries(obj).reduce((acc, [k, v]) => acc + `<li>${k}: <span style="color: ${v ? 'green' : 'red'}">${v}</span></li>\n`, '');
5894
}

src/aleph/vm/orchestrator/views/static/main.css

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ body {
55
max-width: 800px;
66
}
77

8+
details {
9+
margin-top: 30px;
10+
}
11+
812
main {
913
width: 90vw;
1014
margin: 2vh auto;

0 commit comments

Comments
 (0)